From 8a501d52ec23a1998d3aaeaefd4ec9b98468bf30 Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Wed, 1 Dec 2021 22:16:14 +0100 Subject: [PATCH 01/15] support quantization fix some stupid bugs use opset 13 (onnx) --- src/transformer_deploy/backends/ort_utils.py | 2 +- src/transformer_deploy/backends/trt_utils.py | 10 ++- src/transformer_deploy/convert.py | 82 +++++++++++--------- 3 files changed, 54 insertions(+), 40 deletions(-) diff --git a/src/transformer_deploy/backends/ort_utils.py b/src/transformer_deploy/backends/ort_utils.py index 5cf738c4..31c59e69 100644 --- a/src/transformer_deploy/backends/ort_utils.py +++ b/src/transformer_deploy/backends/ort_utils.py @@ -47,7 +47,7 @@ def convert_to_onnx(model_pytorch: PreTrainedModel, output_path: str, inputs_pyt model_pytorch, # model to optimize args=tuple(inputs_pytorch.values()), # tuple of multiple inputs f=output_path, # output path / file object - opset_version=12, # the ONNX version to use + opset_version=13, # the ONNX version to use do_constant_folding=True, # simplify model (replace constant expressions) input_names=list(inputs_pytorch.keys()), # input names output_names=["output"], # output axis name diff --git a/src/transformer_deploy/backends/trt_utils.py b/src/transformer_deploy/backends/trt_utils.py index 6034c601..d8b352c7 100644 --- a/src/transformer_deploy/backends/trt_utils.py +++ b/src/transformer_deploy/backends/trt_utils.py @@ -136,6 +136,8 @@ def build_engine( optimal_shape: Tuple[int, int], max_shape: Tuple[int, int], workspace_size: int, + fp16: bool, + int8: bool, ) -> ICudaEngine: with trt.Builder(logger) as builder: # type: Builder with builder.create_network( @@ -153,9 +155,11 @@ def build_engine( config.set_tactic_sources( tactic_sources=1 << int(trt.TacticSource.CUBLAS) | 1 << int(trt.TacticSource.CUBLAS_LT) ) - # config.set_flag(trt.BuilderFlag.INT8) - # config.set_quantization_flag(trt.QuantizationFlag.CALIBRATE_BEFORE_FUSION) - # config.int8_calibrator = Calibrator() + if int8: + config.set_flag(trt.BuilderFlag.INT8) + # config.set_quantization_flag(trt.QuantizationFlag.CALIBRATE_BEFORE_FUSION) + # config.int8_calibrator = Calibrator() + # if fp16: config.set_flag(trt.BuilderFlag.FP16) config.set_flag(trt.BuilderFlag.DISABLE_TIMING_CACHE) # https://github.com/NVIDIA/TensorRT/issues/1196 (sometimes big diff in output when using FP16) diff --git a/src/transformer_deploy/convert.py b/src/transformer_deploy/convert.py index cf4ef983..dd56c459 100644 --- a/src/transformer_deploy/convert.py +++ b/src/transformer_deploy/convert.py @@ -40,6 +40,7 @@ ) from transformer_deploy.benchmarks.utils import prepare_input, print_timings, setup_logging, track_infer_time from transformer_deploy.templates.triton import Configuration, ModelType +from pytorch_quantization.nn import TensorQuantizer def main(): @@ -48,6 +49,7 @@ def main(): description="optimize and deploy transformers", formatter_class=argparse.ArgumentDefaultsHelpFormatter ) parser.add_argument("-m", "--model", required=True, help="path to model or URL to Hugging Face Hub") + parser.add_argument("-t", "--tokenizer", help="path to tokenizer or URL to Hugging Face Hub") parser.add_argument( "-b", "--batch-size", @@ -64,6 +66,7 @@ def main(): type=int, nargs=3, ) + parser.add_argument("-q", "--quantization", action="store_true", help="int-8 GPU quantization support") parser.add_argument("-w", "--workspace-size", default=10000, help="workspace size in MiB (TensorRT)", type=int) parser.add_argument("-o", "--output", default="triton_models", help="name to be used 
for ") parser.add_argument("-n", "--name", default="transformer", help="model name to be used in triton server") @@ -92,10 +95,12 @@ def main(): tensorrt_path = os.path.join(args.output, "model.plan") assert torch.cuda.is_available(), "CUDA is not available. Please check your CUDA installation" - tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(args.model) + tokenizer_path = args.tokenizer if args.tokenizer else args.model + tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(tokenizer_path) input_names: List[str] = tokenizer.model_input_names logging.info(f"axis: {input_names}") include_token_ids = "token_type_ids" in input_names + model_pytorch: PreTrainedModel = AutoModelForSequenceClassification.from_pretrained(args.model) model_pytorch.cuda() model_pytorch.eval() @@ -113,16 +118,41 @@ def main(): logging.info(f"[Pytorch] input shape {inputs_pytorch['input_ids'].shape}") logging.info(f"[Pytorch] output shape: {output_pytorch.shape}") # create onnx model and compare results + if args.quantization: + TensorQuantizer.use_fb_fake_quant = True convert_to_onnx(model_pytorch=model_pytorch, output_path=onnx_model_path, inputs_pytorch=inputs_pytorch) + if args.quantization: + TensorQuantizer.use_fb_fake_quant = False onnx_model = create_model_for_provider(path=onnx_model_path, provider_to_use="CUDAExecutionProvider") output_onnx = onnx_model.run(None, inputs_onnx) assert np.allclose(a=output_onnx, b=output_pytorch, atol=args.atol) del onnx_model - if "pytorch" not in args.backend: - del model_pytorch timings = {} + if "pytorch" in args.backend: + with torch.inference_mode(): + for _ in range(args.warmup): + _ = model_pytorch(**inputs_pytorch) + torch.cuda.synchronize() + time_buffer = [] + for _ in range(args.nb_measures): + with track_infer_time(time_buffer): + _ = model_pytorch(**inputs_pytorch) + torch.cuda.synchronize() + timings["Pytorch (FP32)"] = time_buffer + with autocast(): + for _ in range(args.warmup): + _ = model_pytorch(**inputs_pytorch) + torch.cuda.synchronize() + time_buffer = [] + for _ in range(args.nb_measures): + with track_infer_time(time_buffer): + _ = model_pytorch(**inputs_pytorch) + torch.cuda.synchronize() + timings["Pytorch (FP16)"] = time_buffer + del model_pytorch + if "tensorrt" in args.backend: trt_logger: Logger = trt.Logger(trt.Logger.INFO if args.verbose else trt.Logger.WARNING) runtime: Runtime = trt.Runtime(trt_logger) @@ -134,6 +164,8 @@ def main(): optimal_shape=tensor_shapes[1], max_shape=tensor_shapes[2], workspace_size=args.workspace_size * 1024 * 1024, + fp16=True, + int8=args.quantization, ) save_engine(engine=engine, engine_file_path=tensorrt_path) # important to check the engine has been correctly serialized @@ -214,44 +246,22 @@ def main(): timings[benchmar_name] = time_buffer del model - conf = Configuration( - model_name=args.name, - model_type=ModelType.ONNX, - batch_size=0, - nb_output=output_pytorch.shape[1], - nb_instance=args.nb_instances, - include_token_type=include_token_ids, - workind_directory=args.output, - ) - conf.create_folders(tokenizer=tokenizer, model_path=onnx_optim_fp16_path) - - if "pytorch" in args.backend: - with torch.inference_mode(): - for _ in range(args.warmup): - _ = model_pytorch(**inputs_pytorch) - torch.cuda.synchronize() - time_buffer = [] - for _ in range(args.nb_measures): - with track_infer_time(time_buffer): - _ = model_pytorch(**inputs_pytorch) - torch.cuda.synchronize() - timings["Pytorch (FP32)"] = time_buffer - with autocast(): - for _ in range(args.warmup): - _ = 
model_pytorch(**inputs_pytorch) - torch.cuda.synchronize() - time_buffer = [] - for _ in range(args.nb_measures): - with track_infer_time(time_buffer): - _ = model_pytorch(**inputs_pytorch) - torch.cuda.synchronize() - timings["Pytorch (FP16)"] = time_buffer - print(f"Inference done on {get_device_name(0)}") print("latencies:") for name, time_buffer in timings.items(): print_timings(name=name, timings=time_buffer) + conf = Configuration( + model_name=args.name, + model_type=ModelType.ONNX, + batch_size=0, + nb_output=output_pytorch.shape[1], + nb_instance=args.nb_instances, + include_token_type=include_token_ids, + workind_directory=args.output, + ) + conf.create_folders(tokenizer=tokenizer, model_path=onnx_optim_fp16_path) + if __name__ == "__main__": main() From 287bab0febf559069e3048ea73b9a0f2700cd889 Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Wed, 1 Dec 2021 22:16:44 +0100 Subject: [PATCH 02/15] add quantization demo --- demo/text_classification_quantization.ipynb | 8074 +++++++++++++++++++ 1 file changed, 8074 insertions(+) create mode 100644 demo/text_classification_quantization.ipynb diff --git a/demo/text_classification_quantization.ipynb b/demo/text_classification_quantization.ipynb new file mode 100644 index 00000000..1a49b73d --- /dev/null +++ b/demo/text_classification_quantization.ipynb @@ -0,0 +1,8074 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# A recipe to perform Nvidia GPU int-8 quantization on most transformers model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Recently, Nvidia added to Hugging Face `transformer` library a new model called `QDQBert`.\n", + "The single purpose of this model is to show how to add GPU quantization to vanilla Bert.\n", + "There are also some demo scripts to demonstrate the use of the model on SQuaD task.\n", + "\n", + "**GPU quantization is a way to double the inference speed of your GPU**.\n", + "It can be applied to any model in theory, and unlike distillation, if done well, it should not decrease your model accuracy.\n", + "\n", + "Unfortunately, these extreme perforamances are not easy to get, it requires some good knowledge of TensorRT API, ONNX export, or quantization process. The purpose of this tutorial is to show a good enough process to perform quantization.\n", + "\n", + "## What is int-8 quantization?\n", + "\n", + "Basic idea behind the expression int-8 quantization is that instead of doing deep learning computations with `float` numbers (usually encoded on 32 bits), you use integers (encoded on 8 bits). On a large matrix multiplication it has 2 effects:\n", + "\n", + "* it reduces by a large margin the size in memory, making **memory transfer faster** (on GPU, many operations are very fast to compute, and memory transfer is the main bottleneck, they are called memory bound)\n", + "* it also makes **computation faster** accelerating the slowest operations (in transformer, mainly big matrix multiplication during the self attention comptutation)\n", + "\n", + "A 8-bit integer can encode values from -128 to +127, and no decimal (as it's an integer).\n", + "So a 8-bit integer can't encode values like `1280.872654`.\n", + "\n", + "However we can use our integer if it's associated to a scale (a FP32 scale). 
For instance, for a scale of 20, I can set my integer to 64 (64*20=1280), it's not exactly `1280.872654` but it's close enough.\n", + "\n", + "That's why we need to perform a step called `calibration` during which the range of values and the scale (encoded as a FP32 float) will be computed.\n", + "\n", + "Basically, we know that by converting a FP32 to an int-8 and its scale, we will lose some information, and the goal of the calibration is to minimize this loss.\n", + "\n", + "If in a matrix, values go from -1.5 to +2, it may be encoded as an integer taking value from -128 to +127, associated to a scale of 64 (2*64=128)\n", + "\n", + "\n", + "[A good documentation on quantization](https://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf)\n", + "\n", + "\n", + "## Why a dedicated tutorial?\n", + "\n", + "The code from Nvidia only supports out of the box vanilla `Bert` model (and not similar models, like RoBerta & co).\n", + "The demo from Nvidia is on the SQuaD task, it's cool but it makes the code a lot less clear that needed.\n", + "\n", + "To be both simple and cover most use cases, in this tutorial we will see:\n", + "\n", + "* how to perform GPU quantization on **any** transformer model (not just Bert) using a simple trick\n", + "* how to to apply quantization to a common task like classification (which is easier to understand than question answering)\n", + "* measure performance gain (latency)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dependencies installation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We install `master` branch of `transfomers` library to use a new model: **QDQBert** and `transformer-deploy` to leverage `TensorRT` models (TensorRT API is not something simple to master, it's highly advised to use a wrapper)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "MOsHUjgdIrIW" + }, + "outputs": [], + "source": [ + "#! pip install git+https://github.com/huggingface/transformers\n", + "#! pip install git+https://github.com/ELS-RD/transformer-deploy\n", + "#! pip install sklearn datasets -U\n", + "#! pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "2OzrD4f-3ydk", + "outputId": "54cc2ea6-6969-4e01-f9f9-78c5fc91ff85" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wed Dec 1 18:59:19 2021 \r\n", + "+-----------------------------------------------------------------------------+\r\n", + "| NVIDIA-SMI 495.29.05 Driver Version: 495.29.05 CUDA Version: 11.5 |\r\n", + "|-------------------------------+----------------------+----------------------+\r\n", + "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n", + "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n", + "| | | MIG M. |\r\n", + "|===============================+======================+======================|\r\n", + "| 0 NVIDIA GeForce ... 
On | 00000000:03:00.0 On | N/A |\r\n", + "| 79% 60C P8 52W / 350W | 311MiB / 24267MiB | 2% Default |\r\n", + "| | | N/A |\r\n", + "+-------------------------------+----------------------+----------------------+\r\n", + " \r\n", + "+-----------------------------------------------------------------------------+\r\n", + "| Processes: |\r\n", + "| GPU GI CI PID Type Process name GPU Memory |\r\n", + "| ID ID Usage |\r\n", + "|=============================================================================|\r\n", + "| 0 N/A N/A 1975 G /usr/lib/xorg/Xorg 188MiB |\r\n", + "| 0 N/A N/A 7865 G /usr/bin/gnome-shell 40MiB |\r\n", + "| 0 N/A N/A 35082 G ...AAAAAAAAA= --shared-files 41MiB |\r\n", + "| 0 N/A N/A 161588 G ..._49620.log --shared-files 12MiB |\r\n", + "| 0 N/A N/A 706814 G ...AAAAAAAAA= --shared-files 25MiB |\r\n", + "+-----------------------------------------------------------------------------+\r\n" + ] + } + ], + "source": [ + "# check that the GPU is enabled\n", + "! nvidia-smi" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rEJBSTyZIrIb" + }, + "source": [ + "# Fine-tuning a model on a text classification task" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This part is inspired from [official Notebooks from Hugging Face](https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "YZbiBDuGIrId" + }, + "outputs": [], + "source": [ + "GLUE_TASKS = [\"cola\", \"mnli\", \"mnli-mm\", \"mrpc\", \"qnli\", \"qqp\", \"rte\", \"sst2\", \"stsb\", \"wnli\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "zVvslsfMIrIh" + }, + "outputs": [], + "source": [ + "task = \"mnli\"\n", + "num_labels = 3 if task.startswith(\"mnli\") else 1 if task==\"stsb\" else 2\n", + "model_checkpoint = \"roberta-base\"\n", + "batch_size = 32\n", + "validation_key = \"validation_mismatched\" if task == \"mnli-mm\" else \"validation_matched\" if task == \"mnli\" else \"validation\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "whPRbBNbIrIl" + }, + "source": [ + "### Loading the dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "W7QYTpxXIrIl" + }, + "source": [ + "We will use the [🤗 Datasets](https://github.com/huggingface/datasets) library to download the data and get the metric we need to use for evaluation (to compare our model to the benchmark). This can be easily done with the functions `load_dataset` and `load_metric`." 
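As a point of reference, here is a minimal sketch of that loading step for the GLUE MNLI setup used in this notebook (the variable names and exact call pattern are illustrative, not the notebook's own cell):

```python
from datasets import load_dataset, load_metric

task = "mnli"  # set in the configuration cell above

# Download (or reuse from the local cache) the GLUE/MNLI dataset
# and the matching metric used later for evaluation.
dataset = load_dataset("glue", task)
metric = load_metric("glue", task)
```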
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "IreSlFmlIrIm" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Reusing dataset glue (/home/geantvert/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d786beb2d0d2475f80ba9b915e2500a9", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/5 [00:00 None:\n", + " with torch.no_grad():\n", + " torch.onnx.export(\n", + " model_pytorch, # model to optimize\n", + " args=(inputs_pytorch[\"input_ids\"], inputs_pytorch[\"attention_mask\"]), # tuple of multiple inputs , inputs_pytorch[\"token_type_ids\"]\n", + " f=output_path, # output path / file object\n", + " opset_version=13, # the ONNX version to use\n", + " do_constant_folding=True, # simplify model (replace constant expressions)\n", + " input_names=[\"input_ids\", \"attention_mask\"], # input names \"token_type_ids\"\n", + " output_names=[\"model_output\"], # output name\n", + " dynamic_axes={ # declare dynamix axis for each input / output (dynamic axis == variable length axis)\n", + " \"input_ids\": {0: \"batch_size\", 1: \"sequence\"},\n", + " \"attention_mask\": {0: \"batch_size\", 1: \"sequence\"},\n", + " #\"token_type_ids\": {0: \"batch_size\", 1: \"sequence\"},\n", + " \"model_output\": {0: \"batch_size\"},\n", + " },\n", + " verbose=False,\n", + " )\n", + "\n", + "def compute_metrics(eval_pred):\n", + " predictions, labels = eval_pred\n", + " if task != \"stsb\":\n", + " predictions = np.argmax(predictions, axis=1)\n", + " else:\n", + " predictions = predictions[:, 0]\n", + " return metric.compute(predictions=predictions, references=labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "metric_name = \"pearson\" if task == \"stsb\" else \"matthews_correlation\" if task == \"cola\" else \"accuracy\"\n", + "model_name = model_checkpoint.split(\"/\")[-1]\n", + "\n", + "nb_step = 1000\n", + "strategy = IntervalStrategy.STEPS\n", + "args = TrainingArguments(\n", + " f\"{model_name}-finetuned-{task}\",\n", + " evaluation_strategy = strategy,\n", + " eval_steps=nb_step,\n", + " logging_steps=nb_step,\n", + " save_steps=nb_step,\n", + " save_strategy = strategy,\n", + " learning_rate=1e-5, # 7.5e-6 https://github.com/pytorch/fairseq/issues/2057#issuecomment-643674771\n", + " per_device_train_batch_size=batch_size,\n", + " per_device_eval_batch_size=batch_size*2,\n", + " num_train_epochs=1,\n", + " fp16=True,\n", + " group_by_length=False,\n", + " weight_decay=0.01,\n", + " load_best_model_at_end=True,\n", + " metric_for_best_model=metric_name,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Transplant weights from one model into Bert architecture\n", + "\n", + "First, you need to know that not all models are quantization compliant. The optimization engine (`TensorRT`) search for some patterns and will fail to opimize the model if it doesn't find them. It requires the code to be written in a certain way. 
For that reason we will try to reuse what works.\n", + "\n", + "We will leverage the fact that since Bert have been released, very few improvements have been brought to the transformer architecture (at least encoder only models).\n", + "Indeed, better model appeared, and most of the work has been done to improve the pretraining step.\n", + "So the idea will be to take the weights from those new models and put them inside Bert.\n", + "\n", + "The reason of this process is to avoid the modification of the source code of these others model.\n", + "Copy-pasting quantization part of QDQModel to another one is not hard (there are only few blocks modified) but would require some work on the user side, making quantization harder that it should be.\n", + "The process described below is not perfect but should work for most users.\n", + "\n", + "**steps**:\n", + "\n", + "* load Bert model\n", + "* retrieve layer/weight names\n", + "* load target model (here Roberta)\n", + "* replace weight/layer names with those from Roberta\n", + "* override the architecture name in model configuration\n", + "\n", + "If there is no 1 to 1 correspondance (it happens), try to keep at least embeddings and self attention. Of course, it's possible that if a model is very different, the transplant may cost some accuracy. In our experience, if your trainset is big enough it should not happen.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "roberta-base\n" + ] + } + ], + "source": [ + "print(model_checkpoint)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n", + "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.bias']\n", + "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + } + ], + "source": [ + "model_bert = AutoModelForSequenceClassification.from_pretrained(\"bert-base-uncased\", num_labels=num_labels)\n", + "bert_keys = list(model_bert.state_dict().keys())\n", + "del model_bert\n", + "\n", + "model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)\n", + "model.save_pretrained(\"roberta-in-bert\")\n", + "del model\n", + "model_weights: OrderedDict[str, Tensor] = torch.load(\"roberta-in-bert/pytorch_model.bin\")\n", + "\n", + "\n", + "# a too simple check\n", + "# IRL, check layer names and find a way to map self attention and embeddings from the original model to Bert\n", + "assert len(model_weights) == len(bert_keys)\n", + "\n", + "for bert_key in bert_keys:\n", + " # pop remove the first weights from the Ordered dict ...\n", + " _, weight = model_weights.popitem(last=False)\n", + " # ... and we re-insert them, in order, with a new key\n", + " model_weights[bert_key] = weight\n", + "\n", + "# we re-export the weights\n", + "torch.save(model_weights, \"roberta-in-bert/pytorch_model.bin\")\n", + "del model_weights\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We override the architecture name to make `transformers` believe it is Bert..." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# =====> change architecture to bert base <======\n", + "import json\n", + "\n", + "with open(\"roberta-in-bert/config.json\") as f:\n", + " content = json.load(f)\n", + " content['architectures'] = [\"bert\"]\n", + "\n", + "with open(\"roberta-in-bert/config.json\", mode=\"w\") as f:\n", + " json.dump(content, f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model training\n", + "\n", + "\n", + "When you create a classification model from a pretrained one, the last layer are randomly initialized.\n", + "We don't want to take these totally random values to compute the calibration of tensors.\n", + "Moreover, our trainset is a bit small, and it's easy to overfit.\n", + "\n", + "Therefore, we train our `Roberta into Bert` model on 1/6 of the train set.\n", + "The goal is to slightly update the weights to the new architecture, not to get the best score.\n", + "\n", + "> another approach is to fully train your model, perform calibration, and then retrain it on a small part of the data with a low learning rate (usually 1/10 of the original one).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You are using a model of type roberta to instantiate a model of type bert. 
This is not supported for all configurations of models and can yield errors.\n", + "max_steps is given, it will override any value given in num_train_epochs\n", + "Using amp half precision backend\n", + "The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.\n", + "***** Running training *****\n", + " Num examples = 392702\n", + " Num Epochs = 1\n", + " Instantaneous batch size per device = 32\n", + " Total train batch size (w. parallel, distributed & accumulation) = 32\n", + " Gradient Accumulation steps = 1\n", + " Total optimization steps = 2000\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [2000/2000 07:21, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Step | Training Loss | Validation Loss | Accuracy
1000 | 0.723500      | 0.532824        | 0.792562
2000 | 0.549100      | 0.483588        | 0.809068
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.\n", + "***** Running Evaluation *****\n", + " Num examples = 9815\n", + " Batch size = 64\n", + "Saving model checkpoint to roberta-base-finetuned-mnli/checkpoint-1000\n", + "Configuration saved in roberta-base-finetuned-mnli/checkpoint-1000/config.json\n", + "Model weights saved in roberta-base-finetuned-mnli/checkpoint-1000/pytorch_model.bin\n", + "tokenizer config file saved in roberta-base-finetuned-mnli/checkpoint-1000/tokenizer_config.json\n", + "Special tokens file saved in roberta-base-finetuned-mnli/checkpoint-1000/special_tokens_map.json\n", + "The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.\n", + "***** Running Evaluation *****\n", + " Num examples = 9815\n", + " Batch size = 64\n", + "Saving model checkpoint to roberta-base-finetuned-mnli/checkpoint-2000\n", + "Configuration saved in roberta-base-finetuned-mnli/checkpoint-2000/config.json\n", + "Model weights saved in roberta-base-finetuned-mnli/checkpoint-2000/pytorch_model.bin\n", + "tokenizer config file saved in roberta-base-finetuned-mnli/checkpoint-2000/tokenizer_config.json\n", + "Special tokens file saved in roberta-base-finetuned-mnli/checkpoint-2000/special_tokens_map.json\n", + "\n", + "\n", + "Training completed. Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n", + "Loading best model from roberta-base-finetuned-mnli/checkpoint-2000 (score: 0.8090677534386144).\n", + "The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.\n", + "***** Running Evaluation *****\n", + " Num examples = 9815\n", + " Batch size = 64\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [154/154 00:18]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Configuration saved in roberta-in-bert-trained/config.json\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'eval_loss': 0.4835878908634186, 'eval_accuracy': 0.8090677534386144, 'eval_runtime': 19.1183, 'eval_samples_per_second': 513.384, 'eval_steps_per_second': 8.055, 'epoch': 0.16}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Model weights saved in roberta-in-bert-trained/pytorch_model.bin\n" + ] + } + ], + "source": [ + "model_bert = BertForSequenceClassification.from_pretrained(\"roberta-in-bert\", num_labels=num_labels)\n", + "model_bert = model_bert.cuda()\n", + "\n", + "args.max_steps = 2000\n", + "trainer = Trainer(\n", + " model_bert,\n", + " args,\n", + " train_dataset=encoded_dataset[\"train\"],\n", + " eval_dataset=encoded_dataset[validation_key],\n", + " tokenizer=tokenizer,\n", + " compute_metrics=compute_metrics\n", + ")\n", + "\n", + "trainer.train()\n", + "print(trainer.evaluate())\n", + "model_bert.save_pretrained(\"roberta-in-bert-trained\")\n", + "del trainer\n", + "del model_bert" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Quantization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below we will start the quantization process.\n", + "It follow those steps:\n", + "\n", + "* perform the calibration\n", + "* perform a quantization aware training\n", + "\n", + "By passing validation values to the model, we will calibrate it, meaning it will get the right range / scale to convert FP32 weights to int-8 ones." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Calibration" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Activate histogram calibration\n", + "\n", + "There are several kinds of calbrators, below we use the percentile one (99.99p) (`histogram`), basically, its purpose is to just remove the most extreme values before computing range / scale.\n", + "The other option is `max`, it's much faster but expect lower accuracy.\n", + "\n", + "Second calibration option, choose between calibration done at the tensor level or per channel (more fine grained, slower)." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# you can also use \"max\" instead of \"historgram\"\n", + "input_desc = QuantDescriptor(num_bits=8, calib_method=\"histogram\")\n", + "# below we do per-channel quantization for weights, set axis to None to get a per tensor calibration\n", + "weight_desc = QuantDescriptor(num_bits=8, axis=(0,))\n", + "quant_nn.QuantLinear.set_default_quant_desc_input(input_desc)\n", + "quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Perform calibration\n", + "\n", + "During this step we will enable the calibration nodes, and pass some representative data to the model.\n", + "It will then be used to compute the scale/range." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "loading configuration file roberta-in-bert-trained/config.json\n", + "You are using a model of type bert to instantiate a model of type qdqbert. 
This is not supported for all configurations of models and can yield errors.\n", + "Model config QDQBertConfig {\n", + " \"_name_or_path\": \"roberta-in-bert\",\n", + " \"architectures\": [\n", + " \"BertForSequenceClassification\"\n", + " ],\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"bos_token_id\": 0,\n", + " \"classifier_dropout\": null,\n", + " \"eos_token_id\": 2,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"id2label\": {\n", + " \"0\": \"LABEL_0\",\n", + " \"1\": \"LABEL_1\",\n", + " \"2\": \"LABEL_2\"\n", + " },\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"label2id\": {\n", + " \"LABEL_0\": 0,\n", + " \"LABEL_1\": 1,\n", + " \"LABEL_2\": 2\n", + " },\n", + " \"layer_norm_eps\": 1e-05,\n", + " \"max_position_embeddings\": 514,\n", + " \"model_type\": \"qdqbert\",\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"pad_token_id\": 1,\n", + " \"position_embedding_type\": \"absolute\",\n", + " \"problem_type\": \"single_label_classification\",\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.13.0.dev0\",\n", + " \"type_vocab_size\": 1,\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50265\n", + "}\n", + "\n", + "loading weights file roberta-in-bert-trained/pytorch_model.bin\n", + "I1201 19:07:22.556784 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.557633 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.558312 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.559234 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.570187 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.570979 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.572141 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.573173 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.587157 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.587976 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.588648 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.589257 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.589886 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.590491 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.591093 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.591677 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.604110 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.604965 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.605788 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.606476 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", 
+ "I1201 19:07:22.608637 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.609386 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.633528 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.634159 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.634558 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.635272 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.659617 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.660160 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.660545 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.661419 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.662083 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.662428 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.672487 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.672940 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.673371 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.673864 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.686383 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.687393 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.688224 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.689111 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.701097 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.702034 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.702666 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.703149 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.703629 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.704051 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.704477 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.704874 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.716984 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.717729 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.718373 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.718999 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.719850 140057592104768 tensor_quantizer.py:101] Creating histogram 
calibrator\n", + "I1201 19:07:22.720462 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.741302 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.741899 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.742321 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.742763 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.769119 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.769680 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.770210 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.771095 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.771780 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.772149 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.783096 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.783622 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.784070 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.784542 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.794926 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I1201 19:07:22.795491 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.795853 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.796230 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.807137 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.807927 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.808553 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.809141 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.809771 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.810495 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.811114 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.811691 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.822019 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.822805 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.823449 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.824043 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.824914 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 
19:07:22.825503 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.846135 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.846942 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.847608 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.848223 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.871000 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.871783 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.872462 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.873089 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.873962 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.874621 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.884854 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.885457 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.886066 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.886688 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.899109 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.899768 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.900748 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.901568 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.911406 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.912059 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.912927 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.913561 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.914347 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.915078 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.915945 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.916547 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.927549 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.928246 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.928908 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.929500 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.930580 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.931228 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + 
"I1201 19:07:22.951412 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.952154 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.952960 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.953933 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.974907 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.975683 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.976436 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.977675 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:22.978751 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.979352 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.991469 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:22.992411 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:22.993380 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:22.994207 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:23.005038 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:23.005791 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:23.006553 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:23.007203 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:23.017255 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:23.017829 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:23.018506 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:23.019221 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:23.019879 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:23.020379 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:23.020857 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:23.021332 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:23.031261 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", + "I1201 19:07:23.031948 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", + "I1201 19:07:23.032585 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:23.033192 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", + "I1201 19:07:23.034027 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:23.034641 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", + "I1201 19:07:23.058038 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in 
QuantLinear with axis None!\n"
+   ]
+  },
+  {
+   "name": "stderr",
+   "output_type": "stream",
+   "text": [
+    "I1201 19:07:23.058811 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n",
+    "I1201 19:07:23.059446 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n",
+    "I1201 19:07:23.060050 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n",
+    "I1201 19:07:23.079895 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n",
+    "I1201 19:07:23.080400 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n",
+    "All model checkpoint weights were used when initializing QDQBertForSequenceClassification.\n",
+    "\n",
+    "All the weights of QDQBertForSequenceClassification were initialized from the model checkpoint at roberta-in-bert-trained.\n",
+    "If your task is similar to the task the model of the checkpoint was trained on, you can already use QDQBertForSequenceClassification for predictions without further training.\n"
+   ]
+  },
+  {
+   "data": {
+    "application/vnd.jupyter.widget-view+json": {
+     "model_id": "dc9c4f7d994a43238fc98b0a4a82a76b",
+     "version_major": 2,
+     "version_minor": 0
+    },
+    "text/plain": [
+     "0it [00:00, ?it/s]"
+    ]
+   },
+   "metadata": {},
+   "output_type": "display_data"
+  },
+  {
+   "name": "stderr",
+   "output_type": "stream",
+   "text": [
+    "I1201 19:07:24.008212 140057592104768 tensor_quantizer.py:183] Disable `quant` stage.\n",
+    "I1201 19:07:24.008661 140057592104768 tensor_quantizer.py:179] Enable HistogramCalibrator\n",
+    "I1201 19:07:24.008988 140057592104768 tensor_quantizer.py:183] Disable `quant` stage.\n",
+    "I1201 19:07:24.009320 140057592104768 tensor_quantizer.py:179] Enable MaxCalibrator\n",
+    "I1201 
19:07:24.207747 140057592104768 tensor_quantizer.py:179] Enable HistogramCalibrator\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I1201 19:07:24.208091 140057592104768 tensor_quantizer.py:183] Disable `quant` stage.\n", + "I1201 19:07:24.208422 140057592104768 tensor_quantizer.py:179] Enable MaxCalibrator\n", + "I1201 19:07:24.208777 140057592104768 tensor_quantizer.py:183] Disable `quant` stage.\n", + "I1201 19:07:24.209120 140057592104768 tensor_quantizer.py:179] Enable HistogramCalibrator\n", + "I1201 19:07:24.209468 140057592104768 tensor_quantizer.py:183] Disable `quant` stage.\n", + "I1201 19:07:24.209815 140057592104768 tensor_quantizer.py:179] Enable HistogramCalibrator\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fee5af3b54564a6d9cf01e84a99b2e34", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/4 [00:00\n", + " \n", + " \n", + " [154/154 07:31]\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The following columns in the training set don't have a corresponding argument in `QDQBertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.\n", + "***** Running training *****\n", + " Num examples = 392702\n", + " Num Epochs = 1\n", + " Instantaneous batch size per device = 32\n", + " Total train batch size (w. parallel, distributed & accumulation) = 32\n", + " Gradient Accumulation steps = 1\n", + " Total optimization steps = 12272\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'eval_loss': 0.5553710460662842, 'eval_accuracy': 0.7799286805909322, 'eval_runtime': 46.6334, 'eval_samples_per_second': 210.472, 'eval_steps_per_second': 3.302}\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [12272/12272 1:22:46, Epoch 1/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining LossValidation LossAccuracy
10000.5814000.5056010.805807
20000.5424000.4819710.811105
30000.5108000.4698230.823637
40000.4940000.4596180.821905
50000.4827000.4188510.837596
60000.4712000.4178290.836373
70000.4607000.4315400.834947
80000.4601000.4020230.847376
90000.4577000.3967120.846052
100000.4354000.3984120.846460
110000.4368000.3961190.848701
120000.4349000.3985570.850229

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The following columns in the evaluation set don't have a corresponding argument in `QDQBertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.\n", + "***** Running Evaluation *****\n", + " Num examples = 9815\n", + " Batch size = 64\n", + "Saving model checkpoint to roberta-base-finetuned-mnli/checkpoint-1000\n", + "Configuration saved in roberta-base-finetuned-mnli/checkpoint-1000/config.json\n", + "Model weights saved in roberta-base-finetuned-mnli/checkpoint-1000/pytorch_model.bin\n", + "tokenizer config file saved in roberta-base-finetuned-mnli/checkpoint-1000/tokenizer_config.json\n", + "Special tokens file saved in roberta-base-finetuned-mnli/checkpoint-1000/special_tokens_map.json\n", + "The following columns in the evaluation set don't have a corresponding argument in `QDQBertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.\n", + "***** Running Evaluation *****\n", + " Num examples = 9815\n", + " Batch size = 64\n", + "Saving model checkpoint to roberta-base-finetuned-mnli/checkpoint-2000\n", + "Configuration saved in roberta-base-finetuned-mnli/checkpoint-2000/config.json\n", + "Model weights saved in roberta-base-finetuned-mnli/checkpoint-2000/pytorch_model.bin\n", + "tokenizer config file saved in roberta-base-finetuned-mnli/checkpoint-2000/tokenizer_config.json\n", + "Special tokens file saved in roberta-base-finetuned-mnli/checkpoint-2000/special_tokens_map.json\n", + "The following columns in the evaluation set don't have a corresponding argument in `QDQBertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.\n", + "***** Running Evaluation *****\n", + " Num examples = 9815\n", + " Batch size = 64\n", + "Saving model checkpoint to roberta-base-finetuned-mnli/checkpoint-3000\n", + "Configuration saved in roberta-base-finetuned-mnli/checkpoint-3000/config.json\n", + "Model weights saved in roberta-base-finetuned-mnli/checkpoint-3000/pytorch_model.bin\n", + "tokenizer config file saved in roberta-base-finetuned-mnli/checkpoint-3000/tokenizer_config.json\n", + "Special tokens file saved in roberta-base-finetuned-mnli/checkpoint-3000/special_tokens_map.json\n", + "The following columns in the evaluation set don't have a corresponding argument in `QDQBertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.\n", + "***** Running Evaluation *****\n", + " Num examples = 9815\n", + " Batch size = 64\n", + "Saving model checkpoint to roberta-base-finetuned-mnli/checkpoint-4000\n", + "Configuration saved in roberta-base-finetuned-mnli/checkpoint-4000/config.json\n", + "Model weights saved in roberta-base-finetuned-mnli/checkpoint-4000/pytorch_model.bin\n", + "tokenizer config file saved in roberta-base-finetuned-mnli/checkpoint-4000/tokenizer_config.json\n", + "Special tokens file saved in roberta-base-finetuned-mnli/checkpoint-4000/special_tokens_map.json\n", + "The following columns in the evaluation set don't have a corresponding argument in `QDQBertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.\n", + "***** Running Evaluation *****\n", + " Num examples = 9815\n", + " Batch size = 64\n", + "Saving model checkpoint to roberta-base-finetuned-mnli/checkpoint-5000\n", + "Configuration saved in roberta-base-finetuned-mnli/checkpoint-5000/config.json\n", + "Model weights 
saved in roberta-base-finetuned-mnli/checkpoint-5000/pytorch_model.bin\n", + "tokenizer config file saved in roberta-base-finetuned-mnli/checkpoint-5000/tokenizer_config.json\n", + "Special tokens file saved in roberta-base-finetuned-mnli/checkpoint-5000/special_tokens_map.json\n", + "[... identical evaluation and checkpoint-saving log lines for checkpoints 6000 through 12000 omitted ...]\n", + "\n", + "\n", + "Training completed. 
Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading best model from roberta-base-finetuned-mnli/checkpoint-12000 (score: 0.8502292409577178).\n", + "W1201 20:35:01.955523 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.0.attention.self.query._input_quantizer: Overwriting amax.\n", + "W1201 20:35:01.956298 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.0.attention.self.query._weight_quantizer: Overwriting amax.\n", + "W1201 20:35:01.959111 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.0.attention.self.key._input_quantizer: Overwriting amax.\n", + "W1201 20:35:01.960055 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.0.attention.self.key._weight_quantizer: Overwriting amax.\n", + "W1201 20:35:01.961468 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.0.attention.self.value._input_quantizer: Overwriting amax.\n", + "W1201 20:35:01.962328 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.0.attention.self.value._weight_quantizer: Overwriting amax.\n", + "W1201 20:35:01.963392 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.0.attention.self.matmul_q_input_quantizer: Overwriting amax.\n", + "W1201 20:35:01.964326 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.0.attention.self.matmul_k_input_quantizer: Overwriting amax.\n", + "W1201 20:35:01.965610 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.0.attention.self.matmul_v_input_quantizer: Overwriting amax.\n", + "W1201 20:35:01.966427 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.0.attention.self.matmul_a_input_quantizer: Overwriting amax.\n", + "W1201 20:35:01.969145 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.0.attention.output.dense._input_quantizer: Overwriting amax.\n", + "W1201 20:35:01.970091 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.0.attention.output.dense._weight_quantizer: Overwriting amax.\n", + "W1201 20:35:01.971021 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.0.attention.output.add_local_input_quantizer: Overwriting amax.\n", + "W1201 20:35:01.971580 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.0.attention.output.add_residual_input_quantizer: Overwriting amax.\n", + "W1201 20:35:01.973517 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.0.intermediate.dense._input_quantizer: Overwriting amax.\n", + "W1201 20:35:01.974354 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.0.intermediate.dense._weight_quantizer: Overwriting amax.\n", + "W1201 20:35:01.977516 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.0.output.dense._input_quantizer: Overwriting amax.\n", + "W1201 20:35:01.978431 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.0.output.dense._weight_quantizer: Overwriting amax.\n", + "W1201 20:35:01.979310 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.0.output.add_local_input_quantizer: Overwriting amax.\n", + "W1201 20:35:01.980026 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.0.output.add_residual_input_quantizer: Overwriting amax.\n", + "W1201 20:35:01.981464 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.1.attention.self.query._input_quantizer: Overwriting amax.\n", + "W1201 20:35:01.982098 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.1.attention.self.query._weight_quantizer: Overwriting amax.\n", + "W1201 20:35:01.983325 
140057592104768 tensor_quantizer.py:402] bert.encoder.layer.1.attention.self.key._input_quantizer: Overwriting amax.\n", + "[... similar 'Overwriting amax.' warnings for every remaining quantizer in encoder layers 1 through 11 omitted ...]\n", + "W1201 20:35:02.199527 140057592104768 tensor_quantizer.py:402] 
bert.encoder.layer.11.intermediate.dense._input_quantizer: Overwriting amax.\n", + "W1201 20:35:02.200487 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.11.intermediate.dense._weight_quantizer: Overwriting amax.\n", + "W1201 20:35:02.202985 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.11.output.dense._input_quantizer: Overwriting amax.\n", + "W1201 20:35:02.203737 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.11.output.dense._weight_quantizer: Overwriting amax.\n", + "W1201 20:35:02.204701 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.11.output.add_local_input_quantizer: Overwriting amax.\n", + "W1201 20:35:02.205333 140057592104768 tensor_quantizer.py:402] bert.encoder.layer.11.output.add_residual_input_quantizer: Overwriting amax.\n", + "The following columns in the evaluation set don't have a corresponding argument in `QDQBertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.\n", + "***** Running Evaluation *****\n", + " Num examples = 9815\n", + " Batch size = 64\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [154/154 56:29]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'eval_loss': 0.39855679869651794, 'eval_accuracy': 0.8502292409577178, 'eval_runtime': 47.3757, 'eval_samples_per_second': 207.174, 'eval_steps_per_second': 3.251, 'epoch': 1.0}\n" + ] + } + ], + "source": [ + "model_q = QDQBertForSequenceClassification.from_pretrained(\"roberta-in-bert-trained-quantized\", num_labels=num_labels)\n", + "model_q = model_q.cuda()\n", + "\n", + "args.max_steps = -1\n", + "trainer = Trainer(\n", + " model_q,\n", + " args,\n", + " train_dataset=encoded_dataset[\"train\"],\n", + " eval_dataset=encoded_dataset[validation_key],\n", + " tokenizer=tokenizer,\n", + " compute_metrics=compute_metrics\n", + ")\n", + "print(trainer.evaluate())\n", + "trainer.train()\n", + "print(trainer.evaluate())\n", + "model_q.save_pretrained(\"roberta-in-bert-trained-quantized-bis\")\n", + "del model_q" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Latency measures\n", + "\n", + "Let's see if what we have done is useful...\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TensorRT quantized model" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "W1201 21:32:19.778870 140057592104768 tensor_quantizer.py:280] Use Pytorch's native experimental fake quantization.\n", + "/home/geantvert/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/pytorch_quantization-2.1.2-py3.9-linux-x86_64.egg/pytorch_quantization/nn/modules/tensor_quantizer.py:285: TracerWarning: Converting a tensor to a Python number might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " inputs, amax.item() / bound, 0,\n", + "/home/geantvert/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/pytorch_quantization-2.1.2-py3.9-linux-x86_64.egg/pytorch_quantization/nn/modules/tensor_quantizer.py:291: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs!\n", + " quant_dim = list(amax.shape).index(list(amax_sequeeze.shape)[0])\n" + ] + } + ], + "source": [ + "from pytorch_quantization.nn import TensorQuantizer\n", + "TensorQuantizer.use_fb_fake_quant = True\n", + "model_q = QDQBertForSequenceClassification.from_pretrained(\"roberta-in-bert-trained-quantized-bis\", num_labels=num_labels)\n", + "model_q = model_q.cuda()\n", + "print(trainer.evaluate())\n", + "convert_to_onnx(model_q, output_path=\"model_q.onnx\", inputs_pytorch=input_torch)\n", + "TensorQuantizer.use_fb_fake_quant = False\n", + "del model_q" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!/usr/src/tensorrt/bin/trtexec --onnx=model_q.onnx --shapes=input_ids:1x384,attention_mask:1x384 --best --workspace=6000" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TensorRT baseline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "baseline_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)\n", + "baseline_model = baseline_model.cuda()\n", + "convert_to_onnx(baseline_model, output_path=\"baseline.onnx\", inputs_pytorch=input_torch)\n", + "del baseline_model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!/usr/src/tensorrt/bin/trtexec --onnx=baseline.onnx --shapes=input_ids:1x384,attention_mask:1x384 --fp16 --workspace=6000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "del baseline_model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Pytorch baseline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [ + "whPRbBNbIrIl", + "n9qywopnIrJH", + "7k8ge1L1IrJk" + ], + "name": "Copie de Text Classification on GLUE", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "0022faf286b44e858e638ccd5ded38b0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": 
"1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5f032f56105f463a8680aa2482d0b162", + "max": 2, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_a02624219ee84f50b1a3032eaa030a39", + "value": 2 + } + }, + "a9b98fd93fcd4fc4a2b2aa88c82835d0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ac14ba24dcf3404db9fd303dbb24d7a5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_17b83e0d0fb947d7bf20319ff930e8fc", + "IPY_MODEL_1da1d80871f545bbb21bf5a84d2120a0", + "IPY_MODEL_c593f2e45e244637821cc5721788bf2c" + ], + "layout": "IPY_MODEL_4e91efae49b64f038fd3fbfcfd2be510" + } + }, + "aecf7f063234416abf3f24766481cb89": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "af16284f77594397a69ad0e322b5e736": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7a75099f99054645bf3fc1b778dac7e6", + "max": 898823, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_30646fa2c0dc494e9dbcbd4dc598410e", + "value": 898823 + } + }, + "af5b646f89024c139c695a1f058fb772": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_21ef195fa88f49c4a2c057f8028177a2", + "max": 4, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_aecf7f063234416abf3f24766481cb89", + "value": 4 + } + }, + "b264b220d9c444bd9da46a7e6c8fd5ed": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8399339998564d21ba5db6f0514c02c6", + "placeholder": "​", + "style": "IPY_MODEL_7701ec898fd443f1b35b187aea3651e9", + "value": " 2/2 [00:00<00:00, 6.46ba/s]" + } + }, + "b4d3f284fc4c4061b58d43a738f9bc78": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7d520bdde27742abb42803843721d101", + "placeholder": "​", + "style": "IPY_MODEL_68c4c867096d41a78740fdee30edcadb", + "value": "Downloading: 100%" + } + }, + "b6be028de2ae4ff691538eedb33793af": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_b4d3f284fc4c4061b58d43a738f9bc78", + "IPY_MODEL_8a11c8fed672470b8335dc575a4a220e", + "IPY_MODEL_08286a6371584b4186014ecb5d5f164d" + ], + "layout": "IPY_MODEL_a3e2c73d393d4e58a371f3da3dd80e6d" + } + }, + "b8722dc10d4447fe9630cbf169260cc8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bbe3a471efb04ea8b5aabc4be819d585": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + 
"box_style": "", + "children": [ + "IPY_MODEL_a61d366d91c34697a55f62b754e1f3a5", + "IPY_MODEL_1bea379404df429b9852b62a938661ae", + "IPY_MODEL_c801e1727de44b67aa7cb1c3d970e1fe" + ], + "layout": "IPY_MODEL_59418bbeb20547e5b5e1a5728262c757" + } + }, + "be4affe852b348de8fe1362582b08da9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_99e94791043b4499b06601f7524f9b14", + "IPY_MODEL_26bc2038bed74279813ab5af09a2724c", + "IPY_MODEL_9bc6e14b912249e3b7d02f31bcc74667" + ], + "layout": "IPY_MODEL_c6c100b71f26405fb960598feb5eee03" + } + }, + "c593f2e45e244637821cc5721788bf2c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c92a19dfa84142af91522bc22f21fca6", + "placeholder": "​", + "style": "IPY_MODEL_990482eebca2424bb5ecbd114007e02c", + "value": " 570/570 [00:00<00:00, 13.1kB/s]" + } + }, + "c6c100b71f26405fb960598feb5eee03": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c7bd52ef524c4d279dfcaa3aebe4a2c5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c801e1727de44b67aa7cb1c3d970e1fe": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4552ee8ca6bd4a0b956651cc23f4ff3c", + "placeholder": "​", + "style": "IPY_MODEL_7c875ecd9cb54405a6c45969bcb4b4c6", + "value": " 1/1 [00:00<00:00, 7.22ba/s]" + } + }, + "c815bfd265f4480298c39c76b9eaf770": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_15aae23369674f82888ed9fbd99739f2", + "max": 501200538, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3e7fbd1c0e534cb8abca18d1edfc9277", + "value": 501200538 + } + }, + "c92a19dfa84142af91522bc22f21fca6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cae29b9c6d45412fab70977fcd0f3234": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "cbb3e9bf5d07406d9768a98a6f0b5b64": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "cbbb20b5d01a4450bfb8dfbf8048d64f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "cced5f1cccc2400a8fbfd7a6eaedc666": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "cedca6e55b84443e82f3d01471d61048": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a0a2918e9772475cac51124b3b83fcaf", + "placeholder": "​", + "style": "IPY_MODEL_4fae966b76844c869cdea1e53891e26f", + "value": "100%" + } + }, + "cf9597523c024514b9b3e66bc77e3fa8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d1ecc3d380fc4758b03190b23686a2f1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d25cca081db3469b80163d6707f5a37d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": 
"ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_360d6eb0e41543dba6d457912e32a77d", + "max": 3, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_788badadfd834f61926a39a43ef1d517", + "value": 3 + } + }, + "d314c0bb87e04893b96de0e18766d3ab": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d5c8ff9e3bd849059fa7b30eab5fc940": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d5d015711ae04d2f801577fc50af6c15": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d6426fea2eda41dd9a31cb3f35b0877e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a51b461c062f4636bfa4b48823d0709b", + "placeholder": "​", + "style": "IPY_MODEL_f651eecbb6d44c24820cf6fe5ab92e7b", + "value": "Downloading: 100%" + } + }, + "d731cfb34124448bbd8baab3d27b75db": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d9a0852554284d36b6b121f579b06b41": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e1f08cf954ae4aea818c90d893486c77": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": 
null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f01fdef82047471e8c1b780cae5379cc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f237ed04039945e9aa224d1b9d04e1b5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f44d2beebfe44186b0ac8016e89e4b49": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f485d2b19ffa4585a1da20986f28af29": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": 
null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f651eecbb6d44c24820cf6fe5ab92e7b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f8a0053903c64e75ac25eab5b24d5871": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f8abc3e44ae3428885aafbea2b37384c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_54c0ad5ab737433190c4a824be128a48", + "placeholder": "​", + "style": "IPY_MODEL_f237ed04039945e9aa224d1b9d04e1b5", + "value": " 3/3 [00:00<00:00, 52.79it/s]" + } + }, + "fa35b3acd9ce4cb098fcd69bb405db00": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fbdb7c7250d846b2880005a9012c484b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": 
null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} \ No newline at end of file From ea74d9b1919d6e0d0f9f6281c82f9fd9d4c9417e Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Wed, 1 Dec 2021 23:09:52 +0100 Subject: [PATCH 03/15] add dependency --- requirements_gpu.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements_gpu.txt b/requirements_gpu.txt index e6f536b2..7a17eabe 100644 --- a/requirements_gpu.txt +++ b/requirements_gpu.txt @@ -8,3 +8,4 @@ nvidia-tensorrt onnx_graphsurgeon polygraphy triton-model-analyzer +pytorch-quantization From d173186ad257840f4fa9ddc22d80e2088b3a47d2 Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Sat, 4 Dec 2021 17:51:35 +0100 Subject: [PATCH 04/15] qdqroberta --- roberta_classic.py | 144 ++ .../QDQModels/QDQRoberta.py | 1635 +++++++++++++++++ src/transformer_deploy/QDQModels/__init__.py | 13 + 3 files changed, 1792 insertions(+) create mode 100644 roberta_classic.py create mode 100644 src/transformer_deploy/QDQModels/QDQRoberta.py create mode 100644 src/transformer_deploy/QDQModels/__init__.py diff --git a/roberta_classic.py b/roberta_classic.py new file mode 100644 index 00000000..7afa97b0 --- /dev/null +++ b/roberta_classic.py @@ -0,0 +1,144 @@ +from datasets import load_dataset, load_metric +from tqdm import tqdm +from transformer_deploy.QDQModels.QDQRoberta import QDQRobertaForSequenceClassification + +from transformers import AutoTokenizer +import pytorch_quantization.nn as quant_nn +from pytorch_quantization.tensor_quant import QuantDescriptor +import numpy as np + +import torch +from transformers import ( + AutoModelForSequenceClassification, + PreTrainedModel, + TrainingArguments, + Trainer, + IntervalStrategy, +) +from pytorch_quantization import calib + +num_labels = 3 +model_checkpoint = "roberta-base" +batch_size = 32 +validation_key = "validation_matched" +dataset = load_dataset("glue", "mnli") +metric = load_metric('glue', "mnli") +nb_step = 1000 +training_strategy = IntervalStrategy.STEPS + +tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True) + + +def preprocess_function(examples): + return tokenizer(examples["premise"], examples["hypothesis"], + truncation=True, + padding="max_length", + max_length=256) + + +def compute_metrics(eval_pred): + predictions, labels = eval_pred + predictions = np.argmax(predictions, axis=1) + return metric.compute(predictions=predictions, references=labels) + + +encoded_dataset = dataset.map(preprocess_function, batched=True) + +args = TrainingArguments( + f"{model_checkpoint}-finetuned", + evaluation_strategy=training_strategy, + eval_steps=nb_step, + logging_steps=nb_step, + save_steps=nb_step, + save_strategy=training_strategy, + learning_rate=1e-5, # 7.5e-6 https://github.com/pytorch/fairseq/issues/2057#issuecomment-643674771 + per_device_train_batch_size=batch_size, + per_device_eval_batch_size=batch_size * 2, + num_train_epochs=1, + fp16=True, + group_by_length=False, + weight_decay=0.01, + load_best_model_at_end=True, + metric_for_best_model="accuracy", +) + +model_roberta: PreTrainedModel = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels) +model_roberta = model_roberta.cuda() + +trainer = Trainer( + model_roberta, + args, + train_dataset=encoded_dataset["train"], + eval_dataset=encoded_dataset[validation_key], + 
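+ # Baseline (non-quantized) fine-tuning; the QAT pass further down reuses the same TrainingArguments with the learning rate divided by 10.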
tokenizer=tokenizer, + compute_metrics=compute_metrics +) +print(trainer.evaluate()) +# {'eval_loss': 0.3559744358062744, 'eval_accuracy': 0.8655119714722364, 'eval_runtime': 19.6678, 'eval_samples_per_second': 499.04, 'eval_steps_per_second': 7.83, 'epoch': 0.98} +trainer.train() +trainer.save_model("roberta-model") +del model_roberta +del trainer + +input_desc = QuantDescriptor(num_bits=8, calib_method="histogram") +# below we do per-channel quantization for weights, set axis to None to get a per tensor calibration +weight_desc = QuantDescriptor(num_bits=8, axis=(0,)) +quant_nn.QuantLinear.set_default_quant_desc_input(input_desc) +quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc) + +model_roberta_q: PreTrainedModel = QDQRobertaForSequenceClassification.from_pretrained("roberta-model") +model_roberta_q = model_roberta_q.cuda() +# Find the TensorQuantizer and enable calibration +for name, module in tqdm(model_roberta_q.named_modules()): + if isinstance(module, quant_nn.TensorQuantizer): + if module._calibrator is not None: + module.disable_quant() + module.enable_calib() + else: + module.disable() + +with torch.no_grad(): + for start_index in tqdm(range(0, 4*batch_size, batch_size)): + end_index = start_index + batch_size + data = encoded_dataset["train"][start_index:end_index] + input_torch = {k: torch.tensor(list(v), dtype=torch.long, device="cuda") + for k, v in data.items() if k in ["input_ids", "attention_mask", "token_type_ids"]} + model_roberta_q(**input_torch) + + +# Finalize calibration +for name, module in model_roberta_q.named_modules(): + if isinstance(module, quant_nn.TensorQuantizer): + if module._calibrator is not None: + if isinstance(module._calibrator, calib.MaxCalibrator): + module.load_calib_amax() + else: + module.load_calib_amax("percentile", percentile=99.99) + module.enable_quant() + module.disable_calib() + else: + module.enable() + +model_roberta_q.cuda() + +model_roberta_q.save_pretrained("roberta-trained-quantized") +del model_roberta_q + +model_roberta_q: PreTrainedModel = QDQRobertaForSequenceClassification.from_pretrained("roberta-trained-quantized", num_labels=num_labels) +model_roberta_q = model_roberta_q.cuda() + +args.learning_rate /= 10 +print(f"LR: {args.learning_rate}") +trainer = Trainer( + model_roberta_q, + args, + train_dataset=encoded_dataset["train"], + eval_dataset=encoded_dataset[validation_key], + tokenizer=tokenizer, + compute_metrics=compute_metrics +) +print(trainer.evaluate()) +# {'eval_loss': 0.38076257705688477, 'eval_accuracy': 0.8552215995924605, 'eval_runtime': 46.9577, 'eval_samples_per_second': 209.018, 'eval_steps_per_second': 3.28} +trainer.train() +print(trainer.evaluate()) +model_roberta_q.save_pretrained("roberta-in-bert-trained-quantized-retrained") diff --git a/src/transformer_deploy/QDQModels/QDQRoberta.py b/src/transformer_deploy/QDQModels/QDQRoberta.py new file mode 100644 index 00000000..5f6b6369 --- /dev/null +++ b/src/transformer_deploy/QDQModels/QDQRoberta.py @@ -0,0 +1,1635 @@ +# Copyright 2021, Lefebvre Sarrut Services +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# copied from Hugging Face transformers library +# modified parts (outside imports) are preceded by -> # QDQ change below + +"""PyTorch RoBERTa model. """ +import math + +import torch +import torch.utils.checkpoint +from packaging import version +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers import RobertaConfig + +from transformers.activations import ACT2FN, gelu +from transformers.file_utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from transformers.modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from transformers.utils import logging + +from pytorch_quantization import nn as quant_nn +from pytorch_quantization.nn.modules.tensor_quantizer import TensorQuantizer + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "roberta-base" +_CONFIG_FOR_DOC = "RobertaConfig" +_TOKENIZER_FOR_DOC = "RobertaTokenizer" + +ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "roberta-base", + "roberta-large", + "roberta-large-mnli", + "distilroberta-base", + "roberta-base-openai-detector", + "roberta-large-openai-detector", + # See all RoBERTa models at https://huggingface.co/models?filter=roberta +] + + +class RobertaEmbeddings(nn.Module): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. 
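+ This QDQ variant leaves the embedding layer unchanged; fake quantization is only inserted around the Linear projections, attention matmuls and residual adds in the classes below.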
+ """ + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + if version.parse(torch.__version__) > version.parse("1.6.0"): + self.register_buffer( + "token_type_ids", + torch.zeros(self.position_ids.size(), dtype=torch.long), + persistent=False, + ) + + # End copy + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if position_ids is None: + # TODO here? + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. 
+ + Args: + inputs_embeds: torch.Tensor + + Returns: torch.Tensor + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ) + return position_ids.unsqueeze(0).expand(input_shape) + + +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Roberta +class RobertaSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + # QDQ change below + self.query = quant_nn.QuantLinear(config.hidden_size, self.all_head_size) + self.key = quant_nn.QuantLinear(config.hidden_size, self.all_head_size) + self.value = quant_nn.QuantLinear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + # QDQ change below + self.matmul_q_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input) + self.matmul_k_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input) + self.matmul_v_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input) + self.matmul_a_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
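+ # QDQ note: query/key/value above are QuantLinear modules, so those projections are already fake-quantized;
+ # the matmul_*_input_quantizers created in __init__ cover the two attention matmuls, which have no Linear wrapper.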
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + # QDQ change below + attention_scores = torch.matmul( + self.matmul_q_input_quantizer(query_layer), self.matmul_k_input_quantizer(key_layer.transpose(-1, -2)) + ) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. 
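+ # (softmax and dropout stay in floating point; attention_probs is fake-quantized by matmul_a_input_quantizer just before the context matmul below)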
+ attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + # QDQ change below + context_layer = torch.matmul( + self.matmul_a_input_quantizer(attention_probs), self.matmul_v_input_quantizer(value_layer) + ) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput +class RobertaSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + # QDQ change below + # Quantize Linear layer + self.dense = quant_nn.QuantLinear(config.hidden_size, config.hidden_size) + + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # QDQ change below + # Quantize the inputs to the residual add + self.add_local_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input) + self.add_residual_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + # QDQ change below + # Quantize the inputs to the residual add + add_local = self.add_local_input_quantizer(hidden_states) + add_residual = self.add_residual_input_quantizer(input_tensor) + hidden_states = self.LayerNorm(add_local + add_residual) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Roberta +class RobertaAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = RobertaSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = RobertaSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], 
hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate +class RobertaIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + # QDQ change below + self.dense = quant_nn.QuantLinear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput +class RobertaOutput(nn.Module): + def __init__(self, config): + super().__init__() + # QDQ change below + # Quantize Linear layer + self.dense = quant_nn.QuantLinear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # QDQ change below + # Quantize the inputs to the residual add + self.add_local_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input) + self.add_residual_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + # QDQ change below + # Quantize the inputs to the residual add + add_local = self.add_local_input_quantizer(hidden_states) + add_residual = self.add_residual_input_quantizer(input_tensor) + hidden_states = self.LayerNorm(add_local + add_residual) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Roberta +class RobertaLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = RobertaAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = RobertaAttention(config, position_embedding_type="absolute") + self.intermediate = RobertaIntermediate(config) + self.output = RobertaOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If 
`encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Roberta +class RobertaEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([RobertaLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler +class RobertaPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class RobertaPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = RobertaConfig + base_model_prefix = "roberta" + supports_gradient_checkpointing = True + + # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, RobertaEncoder): + module.gradient_checkpointing = value + + def update_keys_to_ignore(self, config, del_keys_to_ignore): + """Remove some keys from ignore list""" + if not config.tie_word_embeddings: + # must make a new list, or the class variable gets modified! 
+ self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] + self._keys_to_ignore_on_load_missing = [ + k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore + ] + + +ROBERTA_START_DOCSTRING = r""" + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +ROBERTA_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.RobertaTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. 
+ return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", + ROBERTA_START_DOCSTRING, +) +class RobertaModel(RobertaPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in `Attention is + all you need`_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration + set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` + argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + + .. _`Attention is all you need`: https://arxiv.org/abs/1706.03762 + + """ + + _keys_to_ignore_on_load_missing = [r"position_ids"] + + # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = RobertaEmbeddings(config) + self.encoder = RobertaEncoder(config) + + self.pooler = RobertaPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + # Copied from transformers.models.bert.modeling_bert.BertModel.forward + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """RoBERTa Model with a `language modeling` head on top for CLM fine-tuning. 
""", ROBERTA_START_DOCSTRING +) +class RobertaForCausalLM(RobertaPreTrainedModel): + _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`") + + self.roberta = RobertaModel(config, add_pooling_layer=False) + self.lm_head = RobertaLMHead(config) + + # The LM head weights require special treatment only when they are tied with the word embeddings + self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ + Returns: + + Example:: + + >>> from transformers import RobertaTokenizer, RobertaForCausalLM, RobertaConfig + >>> import torch + + >>> tokenizer = RobertaTokenizer.from_pretrained('roberta-base') + >>> config = RobertaConfig.from_pretrained("roberta-base") + >>> config.is_decoder = True + >>> model = RobertaForCausalLM.from_pretrained('roberta-base', config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past} + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING) +class RobertaForMaskedLM(RobertaPreTrainedModel): + _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `RobertaForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." 
+ ) + + self.roberta = RobertaModel(config, add_pooling_layer=False) + self.lm_head = RobertaLMHead(config) + + # The LM head weights require special treatment only when they are tied with the word embeddings + self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + mask="", + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class RobertaLMHead(nn.Module): + """Roberta Head for masked language modeling.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.decoder = nn.Linear(config.hidden_size, config.vocab_size) + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + self.decoder.bias = self.bias + + def forward(self, features, **kwargs): + x = self.dense(features) + x = gelu(x) + x = self.layer_norm(x) + + # project back to size of vocabulary with bias + x = self.decoder(x) + + return x + + def _tie_weights(self): + # To tie those two weights if they get disconnected (on TPU or when the bias is resized) + self.bias = self.decoder.bias + + +@add_start_docstrings( + """ + RoBERTa Model transformer with a sequence 
classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + ROBERTA_START_DOCSTRING, +) +class QDQRobertaForSequenceClassification(RobertaPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.roberta = RobertaModel(config, add_pooling_layer=False) + self.classifier = RobertaClassificationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. 
+ """, + ROBERTA_START_DOCSTRING, +) +class RobertaForMultipleChoice(RobertaPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config): + super().__init__(config) + + self.roberta = RobertaModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + token_type_ids=None, + attention_mask=None, + labels=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., + num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See + :obj:`input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + flat_inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.roberta( + flat_input_ids, + position_ids=flat_position_ids, + token_type_ids=flat_token_type_ids, + attention_mask=flat_attention_mask, + head_mask=head_mask, + inputs_embeds=flat_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. 
+ """, + ROBERTA_START_DOCSTRING, +) +class RobertaForTokenClassification(RobertaPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.roberta = RobertaModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class RobertaClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take token (equiv. 
to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = torch.tanh(x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +@add_start_docstrings( + """ + Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + ROBERTA_START_DOCSTRING, +) +class RobertaForQuestionAnswering(RobertaPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.roberta = RobertaModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. 
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.roberta(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, split add a dimension
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
+    """
+    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+    are ignored. This is modified from fairseq's `utils.make_positions`.
+
+    Args:
+        input_ids: torch.Tensor
+
+    Returns: torch.Tensor
+    """
+    # QDQ change below
+    # return torch.zeros(input_ids.shape, dtype=torch.long, device="cuda")
+    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+    # int() -> float() because of a limitation in the cumsum operator implementation in TensorRT
+    mask = input_ids.ne(padding_idx).float()
+    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
+    return incremental_indices.long() + padding_idx
diff --git a/src/transformer_deploy/QDQModels/__init__.py b/src/transformer_deploy/QDQModels/__init__.py
new file mode 100644
index 00000000..d754dd37
--- /dev/null
+++ b/src/transformer_deploy/QDQModels/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2021, Lefebvre Sarrut Services
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
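
The QDQRoberta.py file above is long, but it applies only three kinds of edits to the stock RoBERTa code: every nn.Linear becomes a quant_nn.QuantLinear, the two attention matmuls get TensorQuantizer wrappers on their inputs, and both inputs of each residual add feeding a LayerNorm get their own quantizers. The toy module below condenses that pattern for orientation. It is a minimal sketch, not part of the patch: the class name and sizes are made up for the example, and multi-head splitting, score scaling and attention masking are omitted.

    import torch
    import torch.nn as nn
    from pytorch_quantization import nn as quant_nn
    from pytorch_quantization.nn import TensorQuantizer


    class ToyQDQAttention(nn.Module):
        """Toy single-head block showing the three QDQ edits used in QDQRoberta above."""

        def __init__(self, hidden_size: int = 768):
            super().__init__()
            # 1. nn.Linear -> quant_nn.QuantLinear: weights and layer inputs are fake-quantized
            self.query = quant_nn.QuantLinear(hidden_size, hidden_size)
            self.key = quant_nn.QuantLinear(hidden_size, hidden_size)
            self.value = quant_nn.QuantLinear(hidden_size, hidden_size)
            self.dense = quant_nn.QuantLinear(hidden_size, hidden_size)
            self.layer_norm = nn.LayerNorm(hidden_size)
            # 2. explicit quantizers on the inputs of both attention matmuls
            self.matmul_q_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
            self.matmul_k_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
            self.matmul_v_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
            self.matmul_a_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
            # 3. explicit quantizers on both inputs of the residual add feeding the LayerNorm
            self.add_local_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
            self.add_residual_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)

        def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
            q, k, v = self.query(hidden_states), self.key(hidden_states), self.value(hidden_states)
            scores = torch.matmul(
                self.matmul_q_input_quantizer(q), self.matmul_k_input_quantizer(k.transpose(-1, -2))
            )
            probs = scores.softmax(dim=-1)
            context = torch.matmul(self.matmul_a_input_quantizer(probs), self.matmul_v_input_quantizer(v))
            local = self.add_local_input_quantizer(self.dense(context))
            residual = self.add_residual_input_quantizer(hidden_states)
            return self.layer_norm(local + residual)

A QuantLinear or TensorQuantizer only fake-quantizes once it holds an activation range (amax). With pytorch-quantization, that range comes from a calibration pass over a few hundred representative batches; the notebook updated in the next patch imports calib and QuantDescriptor from the same library. The loop below is a sketch of the calibration pattern documented by NVIDIA for pytorch-quantization, not code taken from this repository: dataloader, nb_batches and the percentile value are placeholders, and batches are assumed to be dicts of tensors already on the model's device.

    import torch
    from torch import nn
    from pytorch_quantization import calib
    from pytorch_quantization import nn as quant_nn


    def calibrate(model: nn.Module, dataloader, nb_batches: int = 100) -> None:
        # Switch every quantizer from "quantize" to "collect statistics" mode
        # (_calibrator is set when the quantizer's descriptor asks for calibration).
        for module in model.modules():
            if isinstance(module, quant_nn.TensorQuantizer):
                if module._calibrator is not None:
                    module.disable_quant()
                    module.enable_calib()
                else:
                    module.disable()
        # Feed representative data so each calibrator records activation statistics.
        with torch.no_grad():
            for step, batch in enumerate(dataloader):
                model(**batch)
                if step + 1 >= nb_batches:
                    break
        # Compute amax from the collected statistics, then turn quantization back on.
        for module in model.modules():
            if isinstance(module, quant_nn.TensorQuantizer):
                if module._calibrator is not None:
                    if isinstance(module._calibrator, calib.MaxCalibrator):
                        module.load_calib_amax()
                    else:
                        module.load_calib_amax("percentile", percentile=99.99)
                    module.enable_quant()
                    module.disable_calib()
                else:
                    module.enable()

Once calibrated, the usual last step with this library is to set quant_nn.TensorQuantizer.use_fb_fake_quant = True before calling torch.onnx.export, so that each quantizer is serialized as a QuantizeLinear/DequantizeLinear pair that TensorRT can fuse into int8 kernels.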
From 2d21bc45f2e0720256a2fb9be03954aca0623164 Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Mon, 6 Dec 2021 22:58:51 +0100 Subject: [PATCH 05/15] update quantization notebook --- demo/text_classification_quantization.ipynb | 4955 +++++------------ roberta_classic.py | 110 +- .../QDQModels/QDQRoberta.py | 10 +- src/transformer_deploy/backends/trt_utils.py | 8 +- src/transformer_deploy/convert.py | 29 +- 5 files changed, 1590 insertions(+), 3522 deletions(-) diff --git a/demo/text_classification_quantization.ipynb b/demo/text_classification_quantization.ipynb index 1a49b73d..26fe5ca5 100644 --- a/demo/text_classification_quantization.ipynb +++ b/demo/text_classification_quantization.ipynb @@ -11,14 +11,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Recently, Nvidia added to Hugging Face `transformer` library a new model called `QDQBert`.\n", - "The single purpose of this model is to show how to add GPU quantization to vanilla Bert.\n", - "There are also some demo scripts to demonstrate the use of the model on SQuaD task.\n", + "Quantization is one of the most effective and generic approach to make model inference faster.\n", + "Basically it replaces float numbers generally encoded in 16 or 32 bits by integers encoded in 8 bits or less:\n", + "\n", + "* it takes less memory\n", + "* computation is easier / faster\n", "\n", "**GPU quantization is a way to double the inference speed of your GPU**.\n", "It can be applied to any model in theory, and unlike distillation, if done well, it should not decrease your model accuracy.\n", "\n", - "Unfortunately, these extreme perforamances are not easy to get, it requires some good knowledge of TensorRT API, ONNX export, or quantization process. The purpose of this tutorial is to show a good enough process to perform quantization.\n", + "The purpose of this tutorial is to show 2 processes to perform quantization on most `transformer` architecture.\n", "\n", "## What is int-8 quantization?\n", "\n", @@ -36,7 +38,7 @@ "\n", "Basically, we know that by converting a FP32 to an int-8 and its scale, we will lose some information, and the goal of the calibration is to minimize this loss.\n", "\n", - "If in a matrix, values go from -1.5 to +2, it may be encoded as an integer taking value from -128 to +127, associated to a scale of 64 (2*64=128)\n", + "If in a matrix, values go from -1.5 to +2, it may be encoded as an integer taking value from -127 to +127, associated to a scale of 64 (2*64=128)\n", "\n", "\n", "[A good documentation on quantization](https://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf)\n", @@ -44,6 +46,16 @@ "\n", "## Why a dedicated tutorial?\n", "\n", + "CPU quantization is supported out of the box by `Pytorch` or ONNX Runtime.\n", + "GPU quantization on the other side requires specific tools and process to be applied.\n", + "\n", + "In the specific case of `transformer` models, right now (december 2021), the only way shown by Nvidia is to build manually the graph of your model in `TensorRT`. This is a low level approach, based on GPU capacity knowledge (which operator are supported, etc.). 
It's certainly out of reach of most NLP practitioners.\n", + "\n", + "Hopefully, Nvidia recently added to Hugging Face `transformer` library a new model called `QDQBert`.\n", + "Basically, it's a vanilla `Bert` architecture which supports int-8 quantization.\n", + "It doesn't support any other architecture out of the box, like `Albert`, `Roberta`, or `Electra`.\n", + "The Nvidia demo is dedicated to SQuaD task.\n", + "\n", "The code from Nvidia only supports out of the box vanilla `Bert` model (and not similar models, like RoBerta & co).\n", "The demo from Nvidia is on the SQuaD task, it's cool but it makes the code a lot less clear that needed.\n", "\n", @@ -51,14 +63,20 @@ "\n", "* how to perform GPU quantization on **any** transformer model (not just Bert) using a simple trick\n", "* how to to apply quantization to a common task like classification (which is easier to understand than question answering)\n", - "* measure performance gain (latency)" + "* measure performance gain (latency)\n", + "\n", + "## ToC\n", + "\n", + "### [Dependencies](#Dependencies-installation)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Dependencies installation" + "## Project setup\n", + "\n", + "### Dependencies installation" ] }, { @@ -82,6 +100,13 @@ "#! pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check the GPU is enabled and usable." + ] + }, { "cell_type": "code", "execution_count": 2, @@ -97,7 +122,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Wed Dec 1 18:59:19 2021 \r\n", + "Mon Dec 6 17:39:28 2021 \r\n", "+-----------------------------------------------------------------------------+\r\n", "| NVIDIA-SMI 495.29.05 Driver Version: 495.29.05 CUDA Version: 11.5 |\r\n", "|-------------------------------+----------------------+----------------------+\r\n", @@ -106,7 +131,7 @@ "| | | MIG M. |\r\n", "|===============================+======================+======================|\r\n", "| 0 NVIDIA GeForce ... On | 00000000:03:00.0 On | N/A |\r\n", - "| 79% 60C P8 52W / 350W | 311MiB / 24267MiB | 2% Default |\r\n", + "| 70% 55C P8 47W / 350W | 304MiB / 24267MiB | 15% Default |\r\n", "| | | N/A |\r\n", "+-------------------------------+----------------------+----------------------+\r\n", " \r\n", @@ -115,69 +140,150 @@ "| GPU GI CI PID Type Process name GPU Memory |\r\n", "| ID ID Usage |\r\n", "|=============================================================================|\r\n", - "| 0 N/A N/A 1975 G /usr/lib/xorg/Xorg 188MiB |\r\n", - "| 0 N/A N/A 7865 G /usr/bin/gnome-shell 40MiB |\r\n", - "| 0 N/A N/A 35082 G ...AAAAAAAAA= --shared-files 41MiB |\r\n", - "| 0 N/A N/A 161588 G ..._49620.log --shared-files 12MiB |\r\n", - "| 0 N/A N/A 706814 G ...AAAAAAAAA= --shared-files 25MiB |\r\n", + "| 0 N/A N/A 1636 G /usr/lib/xorg/Xorg 162MiB |\r\n", + "| 0 N/A N/A 7876 G /usr/bin/gnome-shell 45MiB |\r\n", + "| 0 N/A N/A 21136 G ...AAAAAAAAA= --shared-files 20MiB |\r\n", + "| 0 N/A N/A 129021 G ...AAAAAAAAA= --shared-files 38MiB |\r\n", + "| 0 N/A N/A 2438985 G ...359197.log --shared-files 33MiB |\r\n", "+-----------------------------------------------------------------------------+\r\n" ] } ], "source": [ - "# check that the GPU is enabled\n", "! 
nvidia-smi" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 3, "metadata": { - "id": "rEJBSTyZIrIb" + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "ac14ba24dcf3404db9fd303dbb24d7a5", + "4e91efae49b64f038fd3fbfcfd2be510", + "17b83e0d0fb947d7bf20319ff930e8fc", + "1da1d80871f545bbb21bf5a84d2120a0", + "c593f2e45e244637821cc5721788bf2c", + "cbbb20b5d01a4450bfb8dfbf8048d64f", + "854cfd13416543fba8221093b903658b", + "7ec6da801d0d45c4bb80eeab5518e124", + "8585eab4b3fe4992bd7e7c4596e2483b", + "990482eebca2424bb5ecbd114007e02c", + "c92a19dfa84142af91522bc22f21fca6", + "78601982b0e04b80adaa502db2ef685a", + "167874df55014291be95cd390b1e60d3", + "d6426fea2eda41dd9a31cb3f35b0877e", + "163146c2f23440bcbf782116a35b5684", + "0dab554959dc44b3b313ee8ae91ca88d", + "f651eecbb6d44c24820cf6fe5ab92e7b", + "a51b461c062f4636bfa4b48823d0709b", + "cced5f1cccc2400a8fbfd7a6eaedc666", + "cf9597523c024514b9b3e66bc77e3fa8", + "f01fdef82047471e8c1b780cae5379cc", + "e1f08cf954ae4aea818c90d893486c77" + ] + }, + "id": "KPMoLPBn_1vN", + "outputId": "58dca4e7-fc5c-4fd1-a8d4-755aa1e956cb" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from tqdm.notebook import tqdm\n", + "\n", + "from typing import Dict, OrderedDict, List\n", + "import torch\n", + "from torch import Tensor\n", + "from transformers import (\n", + " AutoModelForSequenceClassification,\n", + " PreTrainedModel,\n", + " QDQBertForSequenceClassification,\n", + " BertForSequenceClassification,\n", + " TrainingArguments,\n", + " Trainer,\n", + " IntervalStrategy,\n", + ")\n", + "from transformer_deploy.QDQModels.QDQRoberta import QDQRobertaForSequenceClassification\n", + "import pytorch_quantization.nn as quant_nn\n", + "from pytorch_quantization.tensor_quant import QuantDescriptor\n", + "from pytorch_quantization import calib\n", + "import logging\n", + "import transformers\n", + "import datasets\n", + "from transformer_deploy.backends.trt_utils import build_engine, get_binding_idxs, infer_tensorrt" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "pycharm": { + "name": "#%%\n" + } }, + "outputs": [], "source": [ - "# Fine-tuning a model on a text classification task" + "from pycuda._driver import Stream\n", + "import tensorrt as trt\n", + "from tensorrt.tensorrt import IExecutionContext, Logger, Runtime\n", + "import pycuda.autoinit\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "This part is inspired from [official Notebooks from Hugging Face](https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb)." + "Set logging to `error` to make the `notebook` more readable on Github." 
] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "YZbiBDuGIrId" - }, + "execution_count": 5, + "metadata": {}, "outputs": [], "source": [ - "GLUE_TASKS = [\"cola\", \"mnli\", \"mnli-mm\", \"mrpc\", \"qnli\", \"qqp\", \"rte\", \"sst2\", \"stsb\", \"wnli\"]" + "log_level = logging.ERROR\n", + "logging.getLogger().setLevel(log_level)\n", + "datasets.utils.logging.set_verbosity(log_level)\n", + "transformers.utils.logging.set_verbosity(log_level)\n", + "transformers.utils.logging.enable_default_handler()\n", + "transformers.utils.logging.enable_explicit_format()\n", + "trt_logger: Logger = trt.Logger(trt.Logger.ERROR)\n", + "transformers.logging.set_verbosity_error()" ] }, { - "cell_type": "code", - "execution_count": 4, + "cell_type": "markdown", "metadata": { - "id": "zVvslsfMIrIh" + "id": "rEJBSTyZIrIb" }, - "outputs": [], "source": [ - "task = \"mnli\"\n", - "num_labels = 3 if task.startswith(\"mnli\") else 1 if task==\"stsb\" else 2\n", - "model_checkpoint = \"roberta-base\"\n", - "batch_size = 32\n", - "validation_key = \"validation_mismatched\" if task == \"mnli-mm\" else \"validation_matched\" if task == \"mnli\" else \"validation\"" + "### Download data" ] }, { "cell_type": "markdown", + "metadata": {}, + "source": [ + "This part is inspired from an [official Notebooks from Hugging Face](https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": 6, "metadata": { - "id": "whPRbBNbIrIl" + "id": "zVvslsfMIrIh" }, + "outputs": [], "source": [ - "### Loading the dataset" + "task = \"mnli\"\n", + "num_labels = 3\n", + "model_checkpoint = \"roberta-base\"\n", + "batch_size = 32\n", + "max_seq_len = 256\n", + "validation_key = \"validation_matched\"" ] }, { @@ -191,22 +297,15 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": { "id": "IreSlFmlIrIm" }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Reusing dataset glue (/home/geantvert/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n" - ] - }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "d786beb2d0d2475f80ba9b915e2500a9", + "model_id": "18466bdd0e5b4e819e3bdadfa574eaa2", "version_major": 2, "version_minor": 0 }, @@ -244,18 +343,16 @@ "})" ] }, - "execution_count": 5, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from datasets import load_dataset, load_metric\n", - "import datasets\n", "\n", - "actual_task = \"mnli\" if task == \"mnli-mm\" else task\n", - "dataset = load_dataset(\"glue\", actual_task)\n", - "metric = load_metric('glue', actual_task)\n", + "dataset = load_dataset(\"glue\", task)\n", + "metric = load_metric('glue', task)\n", "dataset" ] }, @@ -286,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -347,75 +444,6 @@ "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)" ] }, - { - "cell_type": "markdown", - "metadata": { - "id": "Vl6IidfdIrJK" - }, - "source": [ - "We pass along `use_fast=True` to the call above to use one of the fast tokenizers (backed by Rust) from the 🤗 Tokenizers library. Those fast tokenizers are available for almost all models, but if you got an error with the previous call, remove that argument." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "fyGdtK9oIrJM" - }, - "outputs": [], - "source": [ - "task_to_keys = {\n", - " \"cola\": (\"sentence\", None),\n", - " \"mnli\": (\"premise\", \"hypothesis\"),\n", - " \"mnli-mm\": (\"premise\", \"hypothesis\"),\n", - " \"mrpc\": (\"sentence1\", \"sentence2\"),\n", - " \"qnli\": (\"question\", \"sentence\"),\n", - " \"qqp\": (\"question1\", \"question2\"),\n", - " \"rte\": (\"sentence1\", \"sentence2\"),\n", - " \"sst2\": (\"sentence\", None),\n", - " \"stsb\": (\"sentence1\", \"sentence2\"),\n", - " \"wnli\": (\"sentence1\", \"sentence2\"),\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xbqtC4MrIrJO" - }, - "source": [ - "We can double check it does work on our current dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "19GG646uIrJO", - "outputId": "b9d1e5e8-21ca-43ea-85c6-f4315d50d96e" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sentence 1: Conceptually cream skimming has two basic dimensions - product and geography.\n", - "Sentence 2: Product and geography are what make cream skimming work. \n" - ] - } - ], - "source": [ - "sentence1_key, sentence2_key = task_to_keys[task]\n", - "if sentence2_key is None:\n", - " print(f\"Sentence: {dataset['train'][0][sentence1_key]}\")\n", - "else:\n", - " print(f\"Sentence 1: {dataset['train'][0][sentence1_key]}\")\n", - " print(f\"Sentence 2: {dataset['train'][0][sentence2_key]}\")" - ] - }, { "cell_type": "markdown", "metadata": { @@ -434,9 +462,7 @@ "outputs": [], "source": [ "def preprocess_function(examples):\n", - " if sentence2_key is None:\n", - " return tokenizer(examples[sentence1_key], truncation=True, padding=\"max_length\", max_length=256)\n", - " return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True, padding=\"max_length\", max_length=256)" + " return tokenizer(examples[\"premise\"], examples[\"hypothesis\"], truncation=True, padding=\"max_length\", max_length=max_seq_len)" ] }, { @@ -482,21 +508,10 @@ ] }, "id": "DDtsaJeVIrJT", - "outputId": "0eeb1cb2-e308-493b-807e-532eeae5f4fe" + "outputId": "0eeb1cb2-e308-493b-807e-532eeae5f4fe", + "scrolled": true }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Loading cached processed dataset at /home/geantvert/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-1f1a13d917d99a50.arrow\n", - "Loading cached processed dataset at /home/geantvert/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-9b9dbd19d82c4713.arrow\n", - "Loading cached processed dataset at /home/geantvert/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-b37b7241dc97daf7.arrow\n", - "Loading cached processed dataset at /home/geantvert/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-f78b5169435f1ed4.arrow\n", - "Loading cached processed dataset at /home/geantvert/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-87c8e6fc7a3e0678.arrow\n" - ] - } - ], + "outputs": [], "source": [ "encoded_dataset = dataset.map(preprocess_function, batched=True)" ] @@ -507,7 +522,7 @@ "id": "545PP3o8IrJV" }, "source": 
[ - "## Fine-tuning the model" + "## Fine-tuning model" ] }, { @@ -516,71 +531,14 @@ "id": "FBiW8UpKIrJW" }, "source": [ - "Now that our data is ready, we can download the pretrained model and fine-tune it. Since all our tasks are about sentence classification, we use the `AutoModelForSequenceClassification` class. Like with the tokenizer, the `from_pretrained` method will download and cache the model for us. The only thing we have to specify is the number of labels for our problem (which is always 2, except for STS-B which is a regression problem and MNLI where we have 3 labels):" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000, - "referenced_widgets": [ - "ac14ba24dcf3404db9fd303dbb24d7a5", - "4e91efae49b64f038fd3fbfcfd2be510", - "17b83e0d0fb947d7bf20319ff930e8fc", - "1da1d80871f545bbb21bf5a84d2120a0", - "c593f2e45e244637821cc5721788bf2c", - "cbbb20b5d01a4450bfb8dfbf8048d64f", - "854cfd13416543fba8221093b903658b", - "7ec6da801d0d45c4bb80eeab5518e124", - "8585eab4b3fe4992bd7e7c4596e2483b", - "990482eebca2424bb5ecbd114007e02c", - "c92a19dfa84142af91522bc22f21fca6", - "78601982b0e04b80adaa502db2ef685a", - "167874df55014291be95cd390b1e60d3", - "d6426fea2eda41dd9a31cb3f35b0877e", - "163146c2f23440bcbf782116a35b5684", - "0dab554959dc44b3b313ee8ae91ca88d", - "f651eecbb6d44c24820cf6fe5ab92e7b", - "a51b461c062f4636bfa4b48823d0709b", - "cced5f1cccc2400a8fbfd7a6eaedc666", - "cf9597523c024514b9b3e66bc77e3fa8", - "f01fdef82047471e8c1b780cae5379cc", - "e1f08cf954ae4aea818c90d893486c77" - ] - }, - "id": "KPMoLPBn_1vN", - "outputId": "58dca4e7-fc5c-4fd1-a8d4-755aa1e956cb" - }, - "outputs": [], - "source": [ - "import pytorch_quantization.nn as quant_nn\n", - "from pytorch_quantization.tensor_quant import QuantDescriptor\n", - "import numpy as np\n", - "from tqdm.notebook import tqdm\n", + "Now that our data are ready, we can download the pretrained model and fine-tune it.\n", "\n", - "from typing import Dict, OrderedDict\n", - "import torch\n", - "from torch import Tensor\n", - "from transformers import (\n", - " AutoModelForSequenceClassification,\n", - " PreTrainedModel,\n", - " QDQBertForSequenceClassification,\n", - " BertForSequenceClassification,\n", - " TrainingArguments,\n", - " Trainer,\n", - " IntervalStrategy,\n", - ")\n", - "import pytorch_quantization\n", - "from pytorch_quantization import calib\n", - "import shutil" + "We will also prepare some export function right now" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": { "pycharm": { "name": "#%%\n" @@ -594,7 +552,7 @@ " model_pytorch, # model to optimize\n", " args=(inputs_pytorch[\"input_ids\"], inputs_pytorch[\"attention_mask\"]), # tuple of multiple inputs , inputs_pytorch[\"token_type_ids\"]\n", " f=output_path, # output path / file object\n", - " opset_version=13, # the ONNX version to use\n", + " opset_version=13, # the ONNX version to use, 13 is the first to support QDQ nodes\n", " do_constant_folding=True, # simplify model (replace constant expressions)\n", " input_names=[\"input_ids\", \"attention_mask\"], # input names \"token_type_ids\"\n", " output_names=[\"model_output\"], # output name\n", @@ -607,13 +565,70 @@ " verbose=False,\n", " )\n", "\n", + "\n", "def compute_metrics(eval_pred):\n", " predictions, labels = eval_pred\n", " if task != \"stsb\":\n", " predictions = np.argmax(predictions, axis=1)\n", " else:\n", " predictions = predictions[:, 0]\n", - " return 
metric.compute(predictions=predictions, references=labels)" + " return metric.compute(predictions=predictions, references=labels)\n", + "\n", + "\n", + "def calibrate(model: PreTrainedModel, encoded_dataset, nb_sample: int=128) -> None:\n", + " # Find the TensorQuantizer and enable calibration\n", + " for name, module in tqdm(model.named_modules()):\n", + " if isinstance(module, quant_nn.TensorQuantizer):\n", + " if module._calibrator is not None:\n", + " module.disable_quant()\n", + " module.enable_calib()\n", + " else:\n", + " module.disable()\n", + "\n", + " with torch.no_grad():\n", + " for start_index in tqdm(range(0, nb_sample, batch_size)):\n", + " end_index = start_index + batch_size\n", + " data = encoded_dataset[\"train\"][start_index:end_index]\n", + " input_torch = {k: torch.tensor(list(v), dtype=torch.long, device=\"cpu\")\n", + " for k, v in data.items() if k in [\"input_ids\", \"attention_mask\", \"token_type_ids\"]}\n", + " model(**input_torch)\n", + "\n", + "\n", + " # Finalize calibration\n", + " for name, module in model.named_modules():\n", + " if isinstance(module, quant_nn.TensorQuantizer):\n", + " if module._calibrator is not None:\n", + " if isinstance(module._calibrator, calib.MaxCalibrator):\n", + " module.load_calib_amax()\n", + " else:\n", + " module.load_calib_amax(\"percentile\", percentile=99.99)\n", + " module.enable_quant()\n", + " module.disable_calib()\n", + " else:\n", + " module.enable()\n", + "\n", + " model.cuda()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "runtime: Runtime = trt.Runtime(trt_logger)\n", + "profile_index = 0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Default parameters to be used for the training:" ] }, { @@ -622,27 +637,25 @@ "metadata": {}, "outputs": [], "source": [ - "metric_name = \"pearson\" if task == \"stsb\" else \"matthews_correlation\" if task == \"cola\" else \"accuracy\"\n", - "model_name = model_checkpoint.split(\"/\")[-1]\n", - "\n", "nb_step = 1000\n", "strategy = IntervalStrategy.STEPS\n", "args = TrainingArguments(\n", - " f\"{model_name}-finetuned-{task}\",\n", + " f\"{model_checkpoint}-finetuned-{task}\",\n", " evaluation_strategy = strategy,\n", " eval_steps=nb_step,\n", " logging_steps=nb_step,\n", " save_steps=nb_step,\n", " save_strategy = strategy,\n", - " learning_rate=1e-5, # 7.5e-6 https://github.com/pytorch/fairseq/issues/2057#issuecomment-643674771\n", + " learning_rate=1e-5,\n", " per_device_train_batch_size=batch_size,\n", " per_device_eval_batch_size=batch_size*2,\n", " num_train_epochs=1,\n", " fp16=True,\n", - " group_by_length=False,\n", + " group_by_length=True,\n", " weight_decay=0.01,\n", " load_best_model_at_end=True,\n", - " metric_for_best_model=metric_name,\n", + " metric_for_best_model=\"accuracy\",\n", + " report_to=[],\n", ")" ] }, @@ -650,24 +663,25 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Transplant weights from one model into Bert architecture\n", + "## Method 1: `Transplantation` of weights from a source model to an optimized architecture\n", + "\n", + "Transplantation idea is to export weights from one model and use them in another one.\n", + "In our case, the source are `Roberta` weights and the target is `Bert` archtecture which is highly optimized on `TensorRT` for GPU quantization.\n", "\n", - "First, you need to know that not all models are quantization compliant. 
The optimization engine (`TensorRT`) search for some patterns and will fail to opimize the model if it doesn't find them. It requires the code to be written in a certain way. For that reason we will try to reuse what works.\n", + "Indeed, not all models are quantization compliant. The optimization engine (`TensorRT`) search for some patterns and will fail to opimize the model if it doesn't find them. It requires the Pytorch code to be written in a certain way and use certain operations. For that reason, it's a good idea to reuse an architecture highly optimized.\n", "\n", - "We will leverage the fact that since Bert have been released, very few improvements have been brought to the transformer architecture (at least encoder only models).\n", - "Indeed, better model appeared, and most of the work has been done to improve the pretraining step.\n", - "So the idea will be to take the weights from those new models and put them inside Bert.\n", + "We will leverage the fact that since `Bert` have been released, very few improvements have been brought to the transformer architecture (at least for encoder only models).\n", + "Better models appeared, and most of the work has been done to improve the pretraining step (aka the weights).\n", + "So the idea will be to take the weights from those new models and put them inside `Bert` architecture.\n", "\n", - "The reason of this process is to avoid the modification of the source code of these others model.\n", - "Copy-pasting quantization part of QDQModel to another one is not hard (there are only few blocks modified) but would require some work on the user side, making quantization harder that it should be.\n", - "The process described below is not perfect but should work for most users.\n", + "The process described below should work for most users.\n", "\n", "**steps**:\n", "\n", - "* load Bert model\n", + "* load `Bert` model\n", "* retrieve layer/weight names\n", - "* load target model (here Roberta)\n", - "* replace weight/layer names with those from Roberta\n", + "* load target model (here `Roberta`)\n", + "* replace weight/layer names with those from `Roberta`\n", "* override the architecture name in model configuration\n", "\n", "If there is no 1 to 1 correspondance (it happens), try to keep at least embeddings and self attention. Of course, it's possible that if a model is very different, the transplant may cost some accuracy. In our experience, if your trainset is big enough it should not happen.\n" @@ -676,61 +690,23 @@ { "cell_type": "code", "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "roberta-base\n" - ] - } - ], - "source": [ - "print(model_checkpoint)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, "metadata": { "pycharm": { "name": "#%%\n" } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n", - "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - "Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.bias']\n", - "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" - ] - } - ], + "outputs": [], "source": [ - "model_bert = AutoModelForSequenceClassification.from_pretrained(\"bert-base-uncased\", num_labels=num_labels)\n", + "model_bert: PreTrainedModel = AutoModelForSequenceClassification.from_pretrained(\"bert-base-uncased\", num_labels=num_labels)\n", "bert_keys = list(model_bert.state_dict().keys())\n", "del model_bert\n", "\n", - "model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)\n", - "model.save_pretrained(\"roberta-in-bert\")\n", - "del model\n", + "model_roberta: PreTrainedModel = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)\n", + "model_roberta.save_pretrained(\"roberta-in-bert\")\n", + "del model_roberta\n", "model_weights: OrderedDict[str, Tensor] = torch.load(\"roberta-in-bert/pytorch_model.bin\")\n", "\n", - "\n", - "# a too simple check\n", - "# IRL, check layer names and find a way to map self attention and embeddings from the original model to Bert\n", - "assert len(model_weights) == len(bert_keys)\n", - "\n", + "# Roberta -> Bert, there is 1 to 1 correspondance, for other models, you may need to create your own mapping.\n", "for bert_key in bert_keys:\n", " # pop remove the first weights from the Ordered dict ...\n", " _, weight = model_weights.popitem(last=False)\n", @@ -739,19 +715,19 @@ "\n", "# we re-export the weights\n", "torch.save(model_weights, \"roberta-in-bert/pytorch_model.bin\")\n", - "del model_weights\n" + "del model_weights" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We override the architecture name to make `transformers` believe it is Bert..." 
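The transplant above relies on the `Roberta` and `Bert` checkpoints storing their weights in exactly the same order. When there is no 1-to-1 correspondence, an explicit name mapping is needed; the sketch below is only an illustration of that idea, and the `electra.*` renaming rules and file paths are hypothetical, not part of the notebook.

```python
# Hedged sketch: transplant through an explicit name mapping instead of relying on identical ordering.
from collections import OrderedDict
from typing import Dict

import torch
from torch import Tensor


def remap_state_dict(source_weights: Dict[str, Tensor], name_mapping: Dict[str, str]) -> Dict[str, Tensor]:
    """Rename source parameters to their Bert counterparts, dropping tensors without a target."""
    remapped: "OrderedDict[str, Tensor]" = OrderedDict()
    for source_name, weight in source_weights.items():
        target_name = name_mapping.get(source_name)
        if target_name is None:
            continue  # e.g. LM head / pooler weights that the classification model does not use
        remapped[target_name] = weight
    return remapped


# hypothetical mapping for an Electra-like source model: keep at least embeddings and self-attention
name_mapping = {
    "electra.embeddings.word_embeddings.weight": "bert.embeddings.word_embeddings.weight",
    "electra.encoder.layer.0.attention.self.query.weight": "bert.encoder.layer.0.attention.self.query.weight",
    # ... one entry per tensor to transplant
}

source_weights: Dict[str, Tensor] = torch.load("electra-base/pytorch_model.bin")  # hypothetical path
torch.save(remap_state_dict(source_weights, name_mapping), "electra-in-bert/pytorch_model.bin")
```

With such a mapping in place, the rest of the process (saving the remapped weights and overriding the architecture name in the configuration, as done in the next cells) stays unchanged.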
+ "We override the architecture name to make `transformers` believe it is `Bert`..." ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -785,155 +761,64 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.\n", - "max_steps is given, it will override any value given in num_train_epochs\n", - "Using amp half precision backend\n", - "The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.\n", - "***** Running training *****\n", - " Num examples = 392702\n", - " Num Epochs = 1\n", - " Instantaneous batch size per device = 32\n", - " Total train batch size (w. parallel, distributed & accumulation) = 32\n", - " Gradient Accumulation steps = 1\n", - " Total optimization steps = 2000\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - " \n", - " \n", - " [2000/2000 07:21, Epoch 0/1]\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
StepTraining LossValidation LossAccuracy
10000.7235000.5328240.792562
20000.5491000.4835880.809068

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.\n", - "***** Running Evaluation *****\n", - " Num examples = 9815\n", - " Batch size = 64\n", - "Saving model checkpoint to roberta-base-finetuned-mnli/checkpoint-1000\n", - "Configuration saved in roberta-base-finetuned-mnli/checkpoint-1000/config.json\n", - "Model weights saved in roberta-base-finetuned-mnli/checkpoint-1000/pytorch_model.bin\n", - "tokenizer config file saved in roberta-base-finetuned-mnli/checkpoint-1000/tokenizer_config.json\n", - "Special tokens file saved in roberta-base-finetuned-mnli/checkpoint-1000/special_tokens_map.json\n", - "The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.\n", - "***** Running Evaluation *****\n", - " Num examples = 9815\n", - " Batch size = 64\n", - "Saving model checkpoint to roberta-base-finetuned-mnli/checkpoint-2000\n", - "Configuration saved in roberta-base-finetuned-mnli/checkpoint-2000/config.json\n", - "Model weights saved in roberta-base-finetuned-mnli/checkpoint-2000/pytorch_model.bin\n", - "tokenizer config file saved in roberta-base-finetuned-mnli/checkpoint-2000/tokenizer_config.json\n", - "Special tokens file saved in roberta-base-finetuned-mnli/checkpoint-2000/special_tokens_map.json\n", - "\n", - "\n", - "Training completed. Do not forget to share your model on huggingface.co/models =)\n", - "\n", - "\n", - "Loading best model from roberta-base-finetuned-mnli/checkpoint-2000 (score: 0.8090677534386144).\n", - "The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.\n", - "***** Running Evaluation *****\n", - " Num examples = 9815\n", - " Batch size = 64\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "

\n", - " \n", - " \n", - " [154/154 00:18]\n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Configuration saved in roberta-in-bert-trained/config.json\n" + "[INFO|trainer.py:437] 2021-12-06 17:39:49,638 >> Using amp half precision backend\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "{'eval_loss': 0.4835878908634186, 'eval_accuracy': 0.8090677534386144, 'eval_runtime': 19.1183, 'eval_samples_per_second': 513.384, 'eval_steps_per_second': 8.055, 'epoch': 0.16}\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Model weights saved in roberta-in-bert-trained/pytorch_model.bin\n" + "{'loss': 0.7303, 'learning_rate': 9.1875814863103e-06, 'epoch': 0.08}\n", + "{'eval_loss': 0.5143516659736633, 'eval_accuracy': 0.8018339276617422, 'eval_runtime': 18.9153, 'eval_samples_per_second': 518.892, 'eval_steps_per_second': 8.142, 'epoch': 0.08}\n", + "{'loss': 0.5419, 'learning_rate': 8.373533246414604e-06, 'epoch': 0.16}\n", + "{'eval_loss': 0.4696938693523407, 'eval_accuracy': 0.8183392766174223, 'eval_runtime': 19.0652, 'eval_samples_per_second': 514.813, 'eval_steps_per_second': 8.078, 'epoch': 0.16}\n", + "{'loss': 0.5056, 'learning_rate': 7.558670143415907e-06, 'epoch': 0.24}\n", + "{'eval_loss': 0.4684630036354065, 'eval_accuracy': 0.819969434538971, 'eval_runtime': 18.5425, 'eval_samples_per_second': 529.326, 'eval_steps_per_second': 8.305, 'epoch': 0.24}\n", + "{'loss': 0.4806, 'learning_rate': 6.744621903520209e-06, 'epoch': 0.33}\n", + "{'eval_loss': 0.42402705550193787, 'eval_accuracy': 0.8364747834946511, 'eval_runtime': 18.5925, 'eval_samples_per_second': 527.901, 'eval_steps_per_second': 8.283, 'epoch': 0.33}\n", + "{'loss': 0.4637, 'learning_rate': 5.929758800521513e-06, 'epoch': 0.41}\n", + "{'eval_loss': 0.41743752360343933, 'eval_accuracy': 0.8404482934284259, 'eval_runtime': 18.5681, 'eval_samples_per_second': 528.596, 'eval_steps_per_second': 8.294, 'epoch': 0.41}\n", + "{'loss': 0.4501, 'learning_rate': 5.1148956975228174e-06, 'epoch': 0.49}\n", + "{'eval_loss': 0.4184797704219818, 'eval_accuracy': 0.8368823229750382, 'eval_runtime': 18.5308, 'eval_samples_per_second': 529.658, 'eval_steps_per_second': 8.31, 'epoch': 0.49}\n", + "{'loss': 0.4488, 'learning_rate': 4.3008474576271195e-06, 'epoch': 0.57}\n", + "{'eval_loss': 0.397051602602005, 'eval_accuracy': 0.8456444218033622, 'eval_runtime': 18.5969, 'eval_samples_per_second': 527.776, 'eval_steps_per_second': 8.281, 'epoch': 0.57}\n", + "{'loss': 0.4404, 'learning_rate': 3.4859843546284226e-06, 'epoch': 0.65}\n", + "{'eval_loss': 0.39308467507362366, 'eval_accuracy': 0.8465613856342333, 'eval_runtime': 18.582, 'eval_samples_per_second': 528.201, 'eval_steps_per_second': 8.288, 'epoch': 0.65}\n", + "{'loss': 0.4311, 'learning_rate': 2.6711212516297265e-06, 'epoch': 0.73}\n", + "{'eval_loss': 0.39400529861450195, 'eval_accuracy': 0.8489047376464595, 'eval_runtime': 18.5238, 'eval_samples_per_second': 529.86, 'eval_steps_per_second': 8.314, 'epoch': 0.73}\n", + "{'loss': 0.4226, 'learning_rate': 1.8562581486310302e-06, 'epoch': 0.81}\n", + "{'eval_loss': 0.38930612802505493, 'eval_accuracy': 0.8527763627101376, 'eval_runtime': 18.5207, 'eval_samples_per_second': 529.948, 'eval_steps_per_second': 8.315, 'epoch': 0.81}\n", + "{'loss': 0.4239, 'learning_rate': 1.0413950456323338e-06, 'epoch': 0.9}\n", + "{'eval_loss': 0.38341203331947327, 'eval_accuracy': 0.85206316861946, 'eval_runtime': 18.552, 
'eval_samples_per_second': 529.052, 'eval_steps_per_second': 8.301, 'epoch': 0.9}\n", + "{'loss': 0.4242, 'learning_rate': 2.2816166883963498e-07, 'epoch': 0.98}\n", + "{'eval_loss': 0.3831214904785156, 'eval_accuracy': 0.8536933265410087, 'eval_runtime': 18.5149, 'eval_samples_per_second': 530.113, 'eval_steps_per_second': 8.318, 'epoch': 0.98}\n", + "{'train_runtime': 2654.3429, 'train_samples_per_second': 147.947, 'train_steps_per_second': 4.623, 'train_loss': 0.4790087224918052, 'epoch': 1.0}\n", + "{'eval_loss': 0.3831214904785156, 'eval_accuracy': 0.8536933265410087, 'eval_runtime': 18.5645, 'eval_samples_per_second': 528.697, 'eval_steps_per_second': 8.295, 'epoch': 1.0}\n", + "{'eval_loss': 0.3831214904785156, 'eval_accuracy': 0.8536933265410087, 'eval_runtime': 18.5645, 'eval_samples_per_second': 528.697, 'eval_steps_per_second': 8.295, 'epoch': 1.0}\n" ] } ], "source": [ + "transformers.logging.set_verbosity_error()\n", "model_bert = BertForSequenceClassification.from_pretrained(\"roberta-in-bert\", num_labels=num_labels)\n", "model_bert = model_bert.cuda()\n", "\n", - "args.max_steps = 2000\n", "trainer = Trainer(\n", " model_bert,\n", " args,\n", " train_dataset=encoded_dataset[\"train\"],\n", " eval_dataset=encoded_dataset[validation_key],\n", " tokenizer=tokenizer,\n", - " compute_metrics=compute_metrics\n", + " compute_metrics=compute_metrics,\n", ")\n", - "\n", + "transformers.logging.set_verbosity_error()\n", "trainer.train()\n", "print(trainer.evaluate())\n", "model_bert.save_pretrained(\"roberta-in-bert-trained\")\n", @@ -945,7 +830,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Quantization" + "### Quantization" ] }, { @@ -982,7 +867,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -1001,480 +886,23 @@ "### Perform calibration\n", "\n", "During this step we will enable the calibration nodes, and pass some representative data to the model.\n", - "It will then be used to compute the scale/range." + "It will then be used to compute the scale/range.\n", + "\n", + "Official recommendations from Nvidia is to calibrate over thousands of examples from the validation set.\n", + "Here we use 40*32 examples, because it's a slow process. It's enough to be close from the original accuracy, on your use case, follow Nvidia process." ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": { - "scrolled": false + "scrolled": true }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "loading configuration file roberta-in-bert-trained/config.json\n", - "You are using a model of type bert to instantiate a model of type qdqbert. 
This is not supported for all configurations of models and can yield errors.\n", - "Model config QDQBertConfig {\n", - " \"_name_or_path\": \"roberta-in-bert\",\n", - " \"architectures\": [\n", - " \"BertForSequenceClassification\"\n", - " ],\n", - " \"attention_probs_dropout_prob\": 0.1,\n", - " \"bos_token_id\": 0,\n", - " \"classifier_dropout\": null,\n", - " \"eos_token_id\": 2,\n", - " \"hidden_act\": \"gelu\",\n", - " \"hidden_dropout_prob\": 0.1,\n", - " \"hidden_size\": 768,\n", - " \"id2label\": {\n", - " \"0\": \"LABEL_0\",\n", - " \"1\": \"LABEL_1\",\n", - " \"2\": \"LABEL_2\"\n", - " },\n", - " \"initializer_range\": 0.02,\n", - " \"intermediate_size\": 3072,\n", - " \"label2id\": {\n", - " \"LABEL_0\": 0,\n", - " \"LABEL_1\": 1,\n", - " \"LABEL_2\": 2\n", - " },\n", - " \"layer_norm_eps\": 1e-05,\n", - " \"max_position_embeddings\": 514,\n", - " \"model_type\": \"qdqbert\",\n", - " \"num_attention_heads\": 12,\n", - " \"num_hidden_layers\": 12,\n", - " \"pad_token_id\": 1,\n", - " \"position_embedding_type\": \"absolute\",\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.13.0.dev0\",\n", - " \"type_vocab_size\": 1,\n", - " \"use_cache\": true,\n", - " \"vocab_size\": 50265\n", - "}\n", - "\n", - "loading weights file roberta-in-bert-trained/pytorch_model.bin\n", - "I1201 19:07:22.556784 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.557633 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.558312 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.559234 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.570187 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.570979 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.572141 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.573173 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.587157 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.587976 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.588648 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.589257 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.589886 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.590491 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.591093 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.591677 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.604110 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.604965 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.605788 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.606476 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", 
- "I1201 19:07:22.608637 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.609386 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.633528 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.634159 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.634558 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.635272 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.659617 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.660160 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.660545 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.661419 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.662083 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.662428 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.672487 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.672940 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.673371 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.673864 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.686383 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.687393 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.688224 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.689111 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.701097 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.702034 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.702666 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.703149 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.703629 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.704051 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.704477 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.704874 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.716984 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.717729 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.718373 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.718999 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.719850 140057592104768 tensor_quantizer.py:101] Creating histogram 
calibrator\n", - "I1201 19:07:22.720462 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.741302 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.741899 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.742321 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.742763 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.769119 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.769680 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.770210 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.771095 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.771780 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.772149 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.783096 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.783622 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.784070 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.784542 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.794926 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "I1201 19:07:22.795491 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.795853 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.796230 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.807137 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.807927 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.808553 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.809141 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.809771 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.810495 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.811114 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.811691 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.822019 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.822805 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.823449 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.824043 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.824914 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 
19:07:22.825503 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.846135 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.846942 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.847608 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.848223 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.871000 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.871783 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.872462 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.873089 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.873962 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.874621 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.884854 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.885457 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.886066 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.886688 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.899109 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.899768 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.900748 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.901568 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.911406 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.912059 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.912927 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.913561 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.914347 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.915078 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.915945 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.916547 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.927549 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.928246 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.928908 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.929500 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.930580 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.931228 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - 
"I1201 19:07:22.951412 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.952154 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.952960 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.953933 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.974907 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.975683 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.976436 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.977675 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:22.978751 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.979352 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.991469 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:22.992411 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:22.993380 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:22.994207 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:23.005038 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:23.005791 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:23.006553 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:23.007203 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:23.017255 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:23.017829 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:23.018506 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:23.019221 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:23.019879 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:23.020379 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:23.020857 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:23.021332 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:23.031261 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:07:23.031948 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:07:23.032585 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:23.033192 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:07:23.034027 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:23.034641 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:07:23.058038 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in 
QuantLinear with axis None!\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [
- [... several hundred near-identical pytorch_quantization INFO log lines removed from these notebook output cells: "Input is fake quantized to 8 bits in QuantLinear with axis None!", "Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!", "Creating histogram calibrator", "Creating Max calibrator", repeated once per quantized Linear layer of the model ...]
- "All model checkpoint weights were used when initializing QDQBertForSequenceClassification.\n", - "\n", - "All the weights of QDQBertForSequenceClassification were initialized from the model checkpoint at roberta-in-bert-trained.\n", - "If your task is similar to the task the model of the checkpoint was trained on, you can already use QDQBertForSequenceClassification for predictions without further training.\n" - ] - },
 { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "dc9c4f7d994a43238fc98b0a4a82a76b", + "model_id": "8ed1b47f25084ffb98165b5a5ba60d22", "version_major": 2, "version_minor": 0 }, @@ -1485,526 +913,10 @@ "metadata": {}, "output_type": "display_data" },
- { - "name": "stderr", - "output_type": "stream", - "text": [
- [... several hundred near-identical pytorch_quantization INFO log lines removed here: "Disable `quant` stage." alternating with "Enable HistogramCalibrator" / "Enable MaxCalibrator", i.e. every TensorQuantizer of the model being switched from quantization to calibration mode before the calibration data is run through it ...]
- "I1201
19:07:24.207747 140057592104768 tensor_quantizer.py:179] Enable HistogramCalibrator\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "I1201 19:07:24.208091 140057592104768 tensor_quantizer.py:183] Disable `quant` stage.\n", - "I1201 19:07:24.208422 140057592104768 tensor_quantizer.py:179] Enable MaxCalibrator\n", - "I1201 19:07:24.208777 140057592104768 tensor_quantizer.py:183] Disable `quant` stage.\n", - "I1201 19:07:24.209120 140057592104768 tensor_quantizer.py:179] Enable HistogramCalibrator\n", - "I1201 19:07:24.209468 140057592104768 tensor_quantizer.py:183] Disable `quant` stage.\n", - "I1201 19:07:24.209815 140057592104768 tensor_quantizer.py:179] Enable HistogramCalibrator\n" - ] - }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "fee5af3b54564a6d9cf01e84a99b2e34", + "model_id": "27b6e88f50be4e229bdcd4e992a83467", "version_major": 2, "version_minor": 0 }, @@ -2016,1104 +928,722 @@ "output_type": "display_data" }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "I1201 19:07:24.249449 140057592104768 histogram.py:69] Calibrator encountered negative values. It shouldn't happen after ReLU. Make sure this is the right tensor to calibrate.\n", - "I1201 19:07:24.322341 140057592104768 max.py:60] Calibrator encountered negative values. It shouldn't happen after ReLU. Make sure this is the right tensor to calibrate.\n", - "W1201 19:11:25.924171 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "W1201 19:11:25.925113 140057592104768 tensor_quantizer.py:238] Call .cuda() if running on GPU after loading calibrated amax.\n", - "I1201 19:11:25.925732 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.926429 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.926960 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:25.927570 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.928478 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:25.929265 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.929759 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.930281 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.930794 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:25.931309 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.931917 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:25.932722 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.933320 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.933851 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.935375 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:25.936755 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.937237 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:25.938004 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.938564 140057592104768 tensor_quantizer.py:187] Enable 
`quant` stage.\n", - "W1201 19:11:25.939165 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.940878 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.941240 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.941617 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.942476 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.943017 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.943430 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.943952 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.944334 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.944707 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.945207 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.945593 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.945962 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.946330 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:25.946732 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.947101 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:25.948596 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.948939 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.949308 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.949787 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.950204 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.950607 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.951201 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.952383 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.952890 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.953278 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([3072, 1]).\n", - "I1201 19:11:25.953657 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.954019 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:25.954503 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.954883 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.955243 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.955616 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:25.955986 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.956342 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:25.956829 140057592104768 tensor_quantizer.py:237] Load calibrated amax, 
shape=torch.Size([]).\n", - "I1201 19:11:25.957189 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.957549 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.959388 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.959772 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.960143 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.960639 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.961327 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.961693 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.962077 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:25.962476 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.962854 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:25.963334 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.963722 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.964087 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.964465 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:25.964836 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.965192 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:25.966062 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.967302 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.967694 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.968163 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:25.968578 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.968949 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:25.969423 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.969820 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.970181 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.971506 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "I1201 19:11:25.971887 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.972258 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.973086 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.973466 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.973832 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.974312 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.974988 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 
19:11:25.975597 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.976069 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.976484 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.977218 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.977980 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:25.978734 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.979220 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:25.979803 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.980222 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.980617 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.981182 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.983010 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.983466 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.984857 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.985381 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.985968 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.986622 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([3072, 1]).\n", - "I1201 19:11:25.987133 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.987572 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:25.988140 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.989069 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.989492 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.990689 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:25.991171 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.991599 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:25.992151 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.992578 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.993011 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.994227 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.994624 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.995114 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.996038 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.996421 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.996902 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:25.997333 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", 
- "I1201 19:11:25.997764 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.998190 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:25.998763 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:25.999198 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:25.999616 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.000059 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.000492 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.000916 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.001345 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.001671 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.002368 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.002961 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.003427 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.003854 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.004457 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.004840 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.005265 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.005782 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.008827 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.009523 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.010430 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.011012 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.011617 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.012572 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.013204 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.013811 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.014614 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.015094 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.015529 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.015997 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.016479 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.016898 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.018180 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.018682 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.019285 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.020093 
140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.020559 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.021015 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.021934 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.022428 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.022884 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.023341 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([3072, 1]).\n", - "I1201 19:11:26.023761 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.024163 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.025327 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "I1201 19:11:26.025758 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.026160 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.026600 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.027027 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.027422 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.028437 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.028844 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.029254 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.030018 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.030419 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.030812 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.031607 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.031997 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.032397 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.032814 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.033215 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.033596 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.034605 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.035006 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.035410 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.035823 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.036223 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.036654 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.037652 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.038041 140057592104768 
tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.038457 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.038881 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.039296 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.039678 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.040672 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.041069 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.041475 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.042263 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.042666 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.043061 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.043828 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.044213 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.044603 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.045393 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.045792 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.046184 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.046998 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.047406 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.047809 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.048225 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.048639 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.049020 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.050037 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.050438 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.051250 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.052364 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.052836 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.053314 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.054255 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.054712 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.055194 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.055634 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([3072, 1]).\n", - "I1201 19:11:26.056124 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.056620 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.057900 140057592104768 
tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.058500 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.058978 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.059447 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.059946 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.060393 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.062382 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.063033 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.063494 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.064432 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.064884 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.065327 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.066207 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.066751 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.067196 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.067848 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.068386 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.068830 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.069777 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.070216 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.070670 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.071287 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.071821 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.072246 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.073431 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.073822 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.074197 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.074593 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "I1201 19:11:26.074990 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.075353 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.076299 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.076682 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.077061 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.077822 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.078284 140057592104768 tensor_quantizer.py:187] 
Enable `quant` stage.\n", - "W1201 19:11:26.078689 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.079439 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.079832 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.080327 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.080979 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.081813 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.082364 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.083022 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.083666 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.084136 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.084990 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.085494 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.085910 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.086870 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.087291 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.087764 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.088692 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.089114 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.089588 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.090586 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.091059 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.091480 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.091969 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([3072, 1]).\n", - "I1201 19:11:26.092467 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.092957 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.094229 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.094680 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.095173 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.095866 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.096348 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.096778 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.097883 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.098311 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.098751 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.099691 140057592104768 tensor_quantizer.py:237] Load calibrated 
amax, shape=torch.Size([]).\n", - "I1201 19:11:26.100139 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.100568 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.101492 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.101877 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.102281 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.102719 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.103145 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.103531 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.104698 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.105144 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.105577 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.106037 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.106483 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.106919 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.107499 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.108504 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.108941 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.109427 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.109966 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.110471 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.111276 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.112012 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.112491 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.113208 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.113619 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.114001 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.114477 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.114864 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.115222 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.115676 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.117420 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.117794 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.118367 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.118748 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.119553 140057592104768 tensor_quantizer.py:173] Disable 
HistogramCalibrator\n", - "W1201 19:11:26.119925 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.120306 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.120646 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.121500 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.121852 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.122227 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.123033 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.123393 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.123810 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "calibration\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "W1201 19:11:26.124270 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.124982 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.125337 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.125682 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([3072, 1]).\n", - "I1201 19:11:26.126024 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.126360 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.126821 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.127168 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.127512 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.127857 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.128201 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.128534 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.130064 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.130435 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.130790 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.131241 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.132012 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.132446 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.133346 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.133914 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.134443 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.134913 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.135293 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.135643 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.136127 
140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.136558 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.136909 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.137258 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.137603 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.137944 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.138386 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.138761 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.139133 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.139504 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.139860 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.140220 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.142096 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.142473 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.142842 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.143534 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.144126 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.144506 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.144970 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.145310 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.145627 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.146053 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.146389 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.146715 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.147159 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.147586 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.147955 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.148300 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.148642 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.149002 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.150712 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.151114 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.151485 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.152328 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.152645 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.152975 
140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.153701 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.154030 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.154361 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.154783 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([3072, 1]).\n", - "I1201 19:11:26.155122 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.155440 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.155856 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.156772 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.157073 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.157397 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.157718 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.158034 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.158441 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.159212 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.159476 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.159883 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.160531 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.160816 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.161237 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.161567 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.161873 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.162191 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.162541 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.162867 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.163278 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.163607 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.163912 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "W1201 19:11:26.164236 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.164555 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.164862 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.165282 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.166933 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.167483 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.167847 140057592104768 tensor_quantizer.py:237] 
Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.168207 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.168518 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.169337 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.169656 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.169963 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.170650 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.170993 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.171299 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.171710 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.172366 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.172665 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.173085 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.173456 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.173775 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.174204 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.174545 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.174849 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.175172 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([768, 1]).\n", - "I1201 19:11:26.175492 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.175794 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.177029 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.177344 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.177689 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.178099 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.178759 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.179049 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.179460 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.179778 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.180079 140057592104768 tensor_quantizer.py:173] Disable HistogramCalibrator\n", - "W1201 19:11:26.180411 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([3072, 1]).\n", - "I1201 19:11:26.180738 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.181051 140057592104768 tensor_quantizer.py:173] Disable MaxCalibrator\n", - "W1201 19:11:26.181462 140057592104768 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).\n", - "I1201 19:11:26.181764 140057592104768 tensor_quantizer.py:187] Enable `quant` stage.\n", - "W1201 19:11:26.182077 140057592104768 tensor_quantizer.py:173] 
Disable HistogramCalibrator\n",
- [... repeated pytorch_quantization calibration log lines trimmed from these removed stderr cells: for each TensorQuantizer in the model, "Load calibrated amax, shape=torch.Size([...])", "Enable `quant` stage." and "Disable HistogramCalibrator" / "Disable MaxCalibrator" ...]
- "Configuration saved in roberta-in-bert-trained-quantized/config.json\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "bert.encoder.layer.0.attention.self.query._input_quantizer TensorQuantizer(8bit fake per-tensor amax=4.3667 calibrator=HistogramCalibrator scale=1.0 quant)\n",
- "bert.encoder.layer.0.attention.self.query._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.2286, 0.7135](768) calibrator=MaxCalibrator scale=1.0 quant)\n",
- "bert.encoder.layer.0.attention.self.key._input_quantizer TensorQuantizer(8bit fake per-tensor amax=4.3667 calibrator=HistogramCalibrator scale=1.0 quant)\n",
- "bert.encoder.layer.0.attention.self.key._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.2130, 0.8616](768) calibrator=MaxCalibrator scale=1.0 quant)\n",
- "bert.encoder.layer.0.attention.self.value._input_quantizer TensorQuantizer(8bit fake per-tensor amax=4.3667 calibrator=HistogramCalibrator scale=1.0 quant)\n",
- "bert.encoder.layer.0.attention.self.value._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0553, 0.3021](768) calibrator=MaxCalibrator scale=1.0 quant)\n",
- "bert.encoder.layer.0.attention.self.matmul_q_input_quantizer TensorQuantizer(8bit fake per-tensor amax=20.9483 calibrator=HistogramCalibrator scale=1.0 quant)\n",
- "bert.encoder.layer.0.attention.self.matmul_k_input_quantizer TensorQuantizer(8bit fake per-tensor amax=19.1295 calibrator=HistogramCalibrator scale=1.0 quant)\n",
- "bert.encoder.layer.0.attention.self.matmul_v_input_quantizer TensorQuantizer(8bit fake per-tensor amax=2.9434 calibrator=HistogramCalibrator scale=1.0 quant)\n",
- "bert.encoder.layer.0.attention.self.matmul_a_input_quantizer TensorQuantizer(8bit fake per-tensor amax=0.9990 calibrator=HistogramCalibrator scale=1.0 quant)\n",
- "bert.encoder.layer.0.attention.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=1.7202 calibrator=HistogramCalibrator scale=1.0 quant)\n",
- "bert.encoder.layer.0.attention.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0895, 0.8283](768) calibrator=MaxCalibrator scale=1.0 quant)\n",
- "bert.encoder.layer.0.attention.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=2.3058 calibrator=HistogramCalibrator scale=1.0 quant)\n",
- "bert.encoder.layer.0.attention.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=4.3667 calibrator=HistogramCalibrator scale=1.0 quant)\n",
- "bert.encoder.layer.0.intermediate.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=16.3581 calibrator=HistogramCalibrator scale=1.0 quant)\n",
- "bert.encoder.layer.0.intermediate.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0793, 0.9986](3072) calibrator=MaxCalibrator scale=1.0 quant)\n",
- "bert.encoder.layer.0.output.dense._input_quantizer TensorQuantizer(8bit
fake per-tensor amax=4.7207 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.0.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1532, 1.0122](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.0.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=8.5718 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.0.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=16.3581 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.1.attention.self.query._input_quantizer TensorQuantizer(8bit fake per-tensor amax=15.4349 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.1.attention.self.query._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1721, 0.5387](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.1.attention.self.key._input_quantizer TensorQuantizer(8bit fake per-tensor amax=15.4349 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.1.attention.self.key._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1740, 0.7034](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.1.attention.self.value._input_quantizer TensorQuantizer(8bit fake per-tensor amax=15.4349 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.1.attention.self.value._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0696, 0.3664](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.1.attention.self.matmul_q_input_quantizer TensorQuantizer(8bit fake per-tensor amax=9.7817 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.1.attention.self.matmul_k_input_quantizer TensorQuantizer(8bit fake per-tensor amax=10.8797 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.1.attention.self.matmul_v_input_quantizer TensorQuantizer(8bit fake per-tensor amax=4.3953 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.1.attention.self.matmul_a_input_quantizer TensorQuantizer(8bit fake per-tensor amax=0.9966 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.1.attention.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=1.9805 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.1.attention.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0581, 0.8137](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.1.attention.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=2.8595 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.1.attention.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=15.4349 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.1.intermediate.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=24.1278 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.1.intermediate.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0945, 0.9753](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.1.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=2.5225 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.1.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1333, 1.0380](768) calibrator=MaxCalibrator scale=1.0 
quant)\n", - "bert.encoder.layer.1.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=5.5873 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.1.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=24.1278 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.2.attention.self.query._input_quantizer TensorQuantizer(8bit fake per-tensor amax=22.4631 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.2.attention.self.query._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1529, 0.5227](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.2.attention.self.key._input_quantizer TensorQuantizer(8bit fake per-tensor amax=22.4631 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.2.attention.self.key._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1589, 0.6583](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.2.attention.self.value._input_quantizer TensorQuantizer(8bit fake per-tensor amax=22.4631 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.2.attention.self.value._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0926, 0.5707](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.2.attention.self.matmul_q_input_quantizer TensorQuantizer(8bit fake per-tensor amax=10.8016 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.2.attention.self.matmul_k_input_quantizer TensorQuantizer(8bit fake per-tensor amax=8.9038 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.2.attention.self.matmul_v_input_quantizer TensorQuantizer(8bit fake per-tensor amax=3.5037 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.2.attention.self.matmul_a_input_quantizer TensorQuantizer(8bit fake per-tensor amax=0.8067 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.2.attention.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=1.3959 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.2.attention.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0550, 0.5825](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.2.attention.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=1.5141 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.2.attention.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=22.4631 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.2.intermediate.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=25.4978 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.2.intermediate.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0963, 0.6521](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.2.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=5.4928 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.2.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1195, 0.9864](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.2.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=8.5919 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.2.output.add_residual_input_quantizer 
TensorQuantizer(8bit fake per-tensor amax=25.4978 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.3.attention.self.query._input_quantizer TensorQuantizer(8bit fake per-tensor amax=19.6996 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.3.attention.self.query._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1667, 0.6095](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.3.attention.self.key._input_quantizer TensorQuantizer(8bit fake per-tensor amax=19.6996 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.3.attention.self.key._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1594, 0.6760](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.3.attention.self.value._input_quantizer TensorQuantizer(8bit fake per-tensor amax=19.6996 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.3.attention.self.value._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1110, 0.4958](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.3.attention.self.matmul_q_input_quantizer TensorQuantizer(8bit fake per-tensor amax=11.2620 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.3.attention.self.matmul_k_input_quantizer TensorQuantizer(8bit fake per-tensor amax=10.0273 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.3.attention.self.matmul_v_input_quantizer TensorQuantizer(8bit fake per-tensor amax=3.9673 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.3.attention.self.matmul_a_input_quantizer TensorQuantizer(8bit fake per-tensor amax=0.6460 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.3.attention.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=1.3340 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.3.attention.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0491, 0.6411](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.3.attention.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=1.4833 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.3.attention.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=19.6996 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.3.intermediate.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=23.9052 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.3.intermediate.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0916, 0.6947](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.3.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=5.3703 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.3.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1261, 1.0282](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.3.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=4.7693 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.3.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=23.9052 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.4.attention.self.query._input_quantizer TensorQuantizer(8bit fake per-tensor amax=18.7359 
calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.4.attention.self.query._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1819, 0.6175](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.4.attention.self.key._input_quantizer TensorQuantizer(8bit fake per-tensor amax=18.7359 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.4.attention.self.key._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1740, 0.6784](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.4.attention.self.value._input_quantizer TensorQuantizer(8bit fake per-tensor amax=18.7359 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.4.attention.self.value._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0960, 0.3665](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.4.attention.self.matmul_q_input_quantizer TensorQuantizer(8bit fake per-tensor amax=10.7623 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.4.attention.self.matmul_k_input_quantizer TensorQuantizer(8bit fake per-tensor amax=10.3514 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.4.attention.self.matmul_v_input_quantizer TensorQuantizer(8bit fake per-tensor amax=2.9243 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.4.attention.self.matmul_a_input_quantizer TensorQuantizer(8bit fake per-tensor amax=0.5537 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.4.attention.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=2.2916 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.4.attention.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0238, 0.5530](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.4.attention.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=2.6874 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.4.attention.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=18.7359 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.4.intermediate.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=22.2396 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.4.intermediate.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0951, 0.6780](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.4.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=6.2925 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.4.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0993, 1.0144](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.4.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=5.8652 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.4.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=22.2396 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.5.attention.self.query._input_quantizer TensorQuantizer(8bit fake per-tensor amax=20.8635 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.5.attention.self.query._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1435, 0.5507](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - 
"bert.encoder.layer.5.attention.self.key._input_quantizer TensorQuantizer(8bit fake per-tensor amax=20.8635 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.5.attention.self.key._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1387, 0.6393](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.5.attention.self.value._input_quantizer TensorQuantizer(8bit fake per-tensor amax=20.8635 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.5.attention.self.value._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1045, 0.3715](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.5.attention.self.matmul_q_input_quantizer TensorQuantizer(8bit fake per-tensor amax=9.5283 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.5.attention.self.matmul_k_input_quantizer TensorQuantizer(8bit fake per-tensor amax=10.4352 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.5.attention.self.matmul_v_input_quantizer TensorQuantizer(8bit fake per-tensor amax=3.9734 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.5.attention.self.matmul_a_input_quantizer TensorQuantizer(8bit fake per-tensor amax=0.5589 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.5.attention.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=1.6919 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.5.attention.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0134, 0.5827](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.5.attention.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=1.9668 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.5.attention.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=20.8635 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.5.intermediate.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=22.9879 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.5.intermediate.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0820, 0.6016](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.5.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=2.8023 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.5.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1060, 1.0209](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.5.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=3.0022 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.5.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=22.9879 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.6.attention.self.query._input_quantizer TensorQuantizer(8bit fake per-tensor amax=21.0328 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.6.attention.self.query._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1642, 0.6050](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.6.attention.self.key._input_quantizer TensorQuantizer(8bit fake per-tensor amax=21.0328 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.6.attention.self.key._weight_quantizer 
TensorQuantizer(8bit fake axis=(0,) amax=[0.1604, 0.6274](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.6.attention.self.value._input_quantizer TensorQuantizer(8bit fake per-tensor amax=21.0328 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.6.attention.self.value._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1190, 0.4425](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.6.attention.self.matmul_q_input_quantizer TensorQuantizer(8bit fake per-tensor amax=9.7586 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.6.attention.self.matmul_k_input_quantizer TensorQuantizer(8bit fake per-tensor amax=10.1608 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.6.attention.self.matmul_v_input_quantizer TensorQuantizer(8bit fake per-tensor amax=3.0164 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.6.attention.self.matmul_a_input_quantizer TensorQuantizer(8bit fake per-tensor amax=0.5824 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.6.attention.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=2.1733 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.6.attention.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0716, 0.5062](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.6.attention.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=2.7819 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.6.attention.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=21.0328 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.6.intermediate.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=22.0411 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.6.intermediate.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1003, 0.6700](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.6.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=2.4391 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.6.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1192, 1.0108](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.6.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=5.5336 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.6.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=22.0411 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.7.attention.self.query._input_quantizer TensorQuantizer(8bit fake per-tensor amax=22.1518 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.7.attention.self.query._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1625, 0.6395](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.7.attention.self.key._input_quantizer TensorQuantizer(8bit fake per-tensor amax=22.1518 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.7.attention.self.key._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1711, 0.6029](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.7.attention.self.value._input_quantizer TensorQuantizer(8bit fake per-tensor amax=22.1518 
calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.7.attention.self.value._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0622, 0.3252](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.7.attention.self.matmul_q_input_quantizer TensorQuantizer(8bit fake per-tensor amax=8.8614 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.7.attention.self.matmul_k_input_quantizer TensorQuantizer(8bit fake per-tensor amax=9.6074 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.7.attention.self.matmul_v_input_quantizer TensorQuantizer(8bit fake per-tensor amax=2.7348 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.7.attention.self.matmul_a_input_quantizer TensorQuantizer(8bit fake per-tensor amax=0.6704 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.7.attention.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=2.4070 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.7.attention.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0548, 0.6221](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.7.attention.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=4.4532 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.7.attention.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=22.1518 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.7.intermediate.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=22.6406 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.7.intermediate.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0800, 0.7407](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.7.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=2.6607 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.7.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1217, 1.2869](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.7.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=7.7671 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.7.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=22.6406 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.8.attention.self.query._input_quantizer TensorQuantizer(8bit fake per-tensor amax=22.6328 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.8.attention.self.query._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1686, 0.5366](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.8.attention.self.key._input_quantizer TensorQuantizer(8bit fake per-tensor amax=22.6328 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.8.attention.self.key._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1653, 0.6490](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.8.attention.self.value._input_quantizer TensorQuantizer(8bit fake per-tensor amax=22.6328 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.8.attention.self.value._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0951, 0.3531](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - 
"bert.encoder.layer.8.attention.self.matmul_q_input_quantizer TensorQuantizer(8bit fake per-tensor amax=9.3184 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.8.attention.self.matmul_k_input_quantizer TensorQuantizer(8bit fake per-tensor amax=9.2597 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.8.attention.self.matmul_v_input_quantizer TensorQuantizer(8bit fake per-tensor amax=3.5218 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.8.attention.self.matmul_a_input_quantizer TensorQuantizer(8bit fake per-tensor amax=0.5794 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.8.attention.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=1.8379 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.8.attention.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0489, 0.5719](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.8.attention.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=6.6549 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.8.attention.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=22.6328 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.8.intermediate.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=21.8385 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.8.intermediate.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0863, 0.5819](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.8.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=3.4752 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.8.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1235, 1.3031](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.8.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=7.2133 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.8.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=21.8385 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.9.attention.self.query._input_quantizer TensorQuantizer(8bit fake per-tensor amax=21.9573 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.9.attention.self.query._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1544, 0.5243](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.9.attention.self.key._input_quantizer TensorQuantizer(8bit fake per-tensor amax=21.9573 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.9.attention.self.key._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1549, 0.5846](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.9.attention.self.value._input_quantizer TensorQuantizer(8bit fake per-tensor amax=21.9573 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.9.attention.self.value._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0839, 0.3537](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.9.attention.self.matmul_q_input_quantizer TensorQuantizer(8bit fake per-tensor amax=9.2343 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.9.attention.self.matmul_k_input_quantizer 
TensorQuantizer(8bit fake per-tensor amax=8.9173 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.9.attention.self.matmul_v_input_quantizer TensorQuantizer(8bit fake per-tensor amax=3.0295 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.9.attention.self.matmul_a_input_quantizer TensorQuantizer(8bit fake per-tensor amax=0.5832 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.9.attention.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=2.2727 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.9.attention.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0700, 0.5093](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.9.attention.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=6.9703 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.9.attention.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=21.9573 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.9.intermediate.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=20.2072 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.9.intermediate.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0692, 0.5374](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.9.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=3.5973 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.9.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1114, 1.0897](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.9.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=6.9660 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.9.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=20.2072 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.10.attention.self.query._input_quantizer TensorQuantizer(8bit fake per-tensor amax=21.2327 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.10.attention.self.query._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1554, 0.5088](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.10.attention.self.key._input_quantizer TensorQuantizer(8bit fake per-tensor amax=21.2327 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.10.attention.self.key._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1604, 0.5558](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.10.attention.self.value._input_quantizer TensorQuantizer(8bit fake per-tensor amax=21.2327 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.10.attention.self.value._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0912, 0.2958](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.10.attention.self.matmul_q_input_quantizer TensorQuantizer(8bit fake per-tensor amax=7.7721 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.10.attention.self.matmul_k_input_quantizer TensorQuantizer(8bit fake per-tensor amax=8.4632 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.10.attention.self.matmul_v_input_quantizer TensorQuantizer(8bit fake per-tensor amax=3.1891 
calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.10.attention.self.matmul_a_input_quantizer TensorQuantizer(8bit fake per-tensor amax=0.6085 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.10.attention.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=2.7075 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.10.attention.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0828, 0.5451](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.10.attention.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=8.5943 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.10.attention.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=21.2327 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.10.intermediate.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=16.0322 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.10.intermediate.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0847, 0.5931](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.10.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=2.9404 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.10.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1075, 1.0234](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.10.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=4.3903 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.10.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=16.0322 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.11.attention.self.query._input_quantizer TensorQuantizer(8bit fake per-tensor amax=14.1766 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.11.attention.self.query._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1734, 0.5519](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.11.attention.self.key._input_quantizer TensorQuantizer(8bit fake per-tensor amax=14.1766 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.11.attention.self.key._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1755, 0.5546](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.11.attention.self.value._input_quantizer TensorQuantizer(8bit fake per-tensor amax=14.1766 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.11.attention.self.value._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.1063, 0.3849](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.11.attention.self.matmul_q_input_quantizer TensorQuantizer(8bit fake per-tensor amax=8.6127 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.11.attention.self.matmul_k_input_quantizer TensorQuantizer(8bit fake per-tensor amax=7.4551 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.11.attention.self.matmul_v_input_quantizer TensorQuantizer(8bit fake per-tensor amax=3.4774 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.11.attention.self.matmul_a_input_quantizer TensorQuantizer(8bit fake per-tensor amax=0.6208 calibrator=HistogramCalibrator scale=1.0 quant)\n", - 
"bert.encoder.layer.11.attention.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=3.7673 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.11.attention.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0971, 0.5766](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.11.attention.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=9.5412 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.11.attention.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=14.1766 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.11.intermediate.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=8.4928 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.11.intermediate.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0982, 0.3916](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.11.output.dense._input_quantizer TensorQuantizer(8bit fake per-tensor amax=4.6082 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.11.output.dense._weight_quantizer TensorQuantizer(8bit fake axis=(0,) amax=[0.0956, 1.0014](768) calibrator=MaxCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.11.output.add_local_input_quantizer TensorQuantizer(8bit fake per-tensor amax=14.0034 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "bert.encoder.layer.11.output.add_residual_input_quantizer TensorQuantizer(8bit fake per-tensor amax=8.4928 calibrator=HistogramCalibrator scale=1.0 quant)\n", - "240 TensorQuantizers found in model\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Model weights saved in roberta-in-bert-trained-quantized/pytorch_model.bin\n" - ] + "data": { + "text/plain": [ + "QDQBertForSequenceClassification(\n", + " (bert): QDQBertModel(\n", + " (embeddings): QDQBertEmbeddings(\n", + " (word_embeddings): Embedding(50265, 768, padding_idx=1)\n", + " (position_embeddings): Embedding(514, 768)\n", + " (token_type_embeddings): Embedding(1, 768)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (encoder): QDQBertEncoder(\n", + " (layer): ModuleList(\n", + " (0): QDQBertLayer(\n", + " (attention): QDQBertAttention(\n", + " (self): QDQBertSelfAttention(\n", + " (query): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=4.3825 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.2278, 0.7138](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (key): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=4.3825 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.2136, 0.8620](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (value): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=4.3825 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0559, 0.3011](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (dropout): Dropout(p=0.1, 
inplace=False)\n", + " (matmul_q_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=20.8514 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_k_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=19.0919 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_v_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=2.8985 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_a_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=0.9990 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " (output): QDQBertSelfOutput(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=1.6973 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0893, 0.8268](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=1.7937 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=4.3825 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (intermediate): QDQBertIntermediate(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=3072, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=14.6473 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0784, 0.9981](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (output): QDQBertOutput(\n", + " (dense): QuantLinear(\n", + " in_features=3072, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=5.0149 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1539, 1.0117](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=10.3320 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=14.6473 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (1): QDQBertLayer(\n", + " (attention): QDQBertAttention(\n", + " (self): QDQBertSelfAttention(\n", + " (query): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=14.9149 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1719, 0.5387](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (key): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=14.9149 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1739, 0.7034](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (value): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=14.9149 
calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0692, 0.3668](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (matmul_q_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=9.8894 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_k_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=11.1935 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_v_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=4.4753 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_a_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=0.9980 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " (output): QDQBertSelfOutput(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=2.0571 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0568, 0.8128](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=2.8345 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=14.9149 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (intermediate): QDQBertIntermediate(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=3072, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=23.7561 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0939, 0.9761](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (output): QDQBertOutput(\n", + " (dense): QuantLinear(\n", + " in_features=3072, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=2.4792 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1338, 1.0384](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=5.1303 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=23.7561 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (2): QDQBertLayer(\n", + " (attention): QDQBertAttention(\n", + " (self): QDQBertSelfAttention(\n", + " (query): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.2597 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1519, 0.5226](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (key): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.2597 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1586, 
0.6574](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (value): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.2597 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0927, 0.5691](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (matmul_q_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=10.8590 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_k_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=8.8922 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_v_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=3.5050 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_a_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=0.7881 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " (output): QDQBertSelfOutput(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=1.5292 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0546, 0.5824](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=1.5733 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.2597 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (intermediate): QDQBertIntermediate(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=3072, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=25.3726 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0962, 0.6515](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (output): QDQBertOutput(\n", + " (dense): QuantLinear(\n", + " in_features=3072, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=5.4743 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1189, 0.9865](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=8.3749 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=25.3726 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (3): QDQBertLayer(\n", + " (attention): QDQBertAttention(\n", + " (self): QDQBertSelfAttention(\n", + " (query): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=19.5269 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1655, 0.6085](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (key): QuantLinear(\n", + " in_features=768, out_features=768, 
bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=19.5269 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1591, 0.6744](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (value): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=19.5269 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1096, 0.4942](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (matmul_q_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=11.2382 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_k_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=10.1940 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_v_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=4.0930 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_a_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=0.6143 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " (output): QDQBertSelfOutput(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=1.4349 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0487, 0.6398](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=1.4685 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=19.5269 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (intermediate): QDQBertIntermediate(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=3072, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=23.5186 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0921, 0.6942](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (output): QDQBertOutput(\n", + " (dense): QuantLinear(\n", + " in_features=3072, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=5.5658 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1252, 1.0283](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=3.2256 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=23.5186 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (4): QDQBertLayer(\n", + " (attention): QDQBertAttention(\n", + " (self): QDQBertSelfAttention(\n", + " (query): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=19.0308 calibrator=HistogramCalibrator scale=1.0 
quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1818, 0.6171](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (key): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=19.0308 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1737, 0.6774](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (value): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=19.0308 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0965, 0.3672](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (matmul_q_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=10.3299 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_k_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=9.9871 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_v_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=2.9571 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_a_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=0.5410 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " (output): QDQBertSelfOutput(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=2.0579 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0243, 0.5534](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=2.5376 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=19.0308 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (intermediate): QDQBertIntermediate(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=3072, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.8352 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0947, 0.6763](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (output): QDQBertOutput(\n", + " (dense): QuantLinear(\n", + " in_features=3072, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=5.8965 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1001, 1.0148](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=4.1937 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.8352 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (5): QDQBertLayer(\n", + " (attention): QDQBertAttention(\n", + " (self): 
QDQBertSelfAttention(\n", + " (query): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=21.2023 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1441, 0.5500](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (key): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=21.2023 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1398, 0.6392](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (value): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=21.2023 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1045, 0.3702](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (matmul_q_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=9.6675 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_k_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=10.5083 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_v_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=3.8853 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_a_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=0.5531 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " (output): QDQBertSelfOutput(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=1.7910 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0132, 0.5822](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=2.0695 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=21.2023 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (intermediate): QDQBertIntermediate(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=3072, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=23.2437 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0828, 0.6019](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (output): QDQBertOutput(\n", + " (dense): QuantLinear(\n", + " in_features=3072, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=2.6517 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1048, 1.0222](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=2.9217 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " 
(add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=23.2437 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (6): QDQBertLayer(\n", + " (attention): QDQBertAttention(\n", + " (self): QDQBertSelfAttention(\n", + " (query): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=21.3477 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1642, 0.6043](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (key): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=21.3477 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1595, 0.6278](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (value): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=21.3477 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1190, 0.4426](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (matmul_q_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=9.4666 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_k_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=9.9298 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_v_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=3.0881 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_a_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=0.5958 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " (output): QDQBertSelfOutput(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=2.0587 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0709, 0.5058](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=3.3172 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=21.3477 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (intermediate): QDQBertIntermediate(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=3072, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.1748 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1002, 0.6699](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (output): QDQBertOutput(\n", + " (dense): QuantLinear(\n", + " in_features=3072, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=2.4878 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1194, 1.0115](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, 
elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=6.5609 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.1748 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (7): QDQBertLayer(\n", + " (attention): QDQBertAttention(\n", + " (self): QDQBertSelfAttention(\n", + " (query): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.3952 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1621, 0.6402](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (key): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.3952 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1712, 0.6015](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (value): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.3952 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0624, 0.3250](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (matmul_q_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=8.8818 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_k_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=9.0426 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_v_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=2.8084 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_a_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=0.7217 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " (output): QDQBertSelfOutput(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=2.8457 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0550, 0.6221](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=5.8872 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.3952 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (intermediate): QDQBertIntermediate(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=3072, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.3410 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0798, 0.7414](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (output): QDQBertOutput(\n", + " (dense): QuantLinear(\n", + " in_features=3072, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=2.8300 
calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1221, 1.2854](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=11.7479 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.3410 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (8): QDQBertLayer(\n", + " (attention): QDQBertAttention(\n", + " (self): QDQBertSelfAttention(\n", + " (query): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.8846 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1674, 0.5365](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (key): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.8846 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1649, 0.6477](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (value): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.8846 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0949, 0.3530](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (matmul_q_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=9.9038 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_k_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=9.0857 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_v_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=3.3595 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_a_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=0.5520 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " (output): QDQBertSelfOutput(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=1.8644 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0494, 0.5720](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=6.3703 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.8846 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (intermediate): QDQBertIntermediate(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=3072, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.0888 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0869, 0.5807](3072) calibrator=MaxCalibrator scale=1.0 
quant)\n", + " )\n", + " )\n", + " (output): QDQBertOutput(\n", + " (dense): QuantLinear(\n", + " in_features=3072, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=3.3958 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1237, 1.3025](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=8.2477 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.0888 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (9): QDQBertLayer(\n", + " (attention): QDQBertAttention(\n", + " (self): QDQBertSelfAttention(\n", + " (query): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.7030 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1545, 0.5240](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (key): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.7030 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1542, 0.5843](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (value): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.7030 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0838, 0.3533](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (matmul_q_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=9.8699 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_k_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=9.0611 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_v_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=3.3043 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_a_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=0.5683 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " (output): QDQBertSelfOutput(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=2.1081 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0701, 0.5085](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=6.3062 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=22.7030 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (intermediate): QDQBertIntermediate(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=3072, bias=True\n", + " (_input_quantizer): 
TensorQuantizer(8bit fake per-tensor amax=21.3909 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0689, 0.5368](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (output): QDQBertOutput(\n", + " (dense): QuantLinear(\n", + " in_features=3072, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=3.6746 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1116, 1.0891](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=5.0953 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=21.3909 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (10): QDQBertLayer(\n", + " (attention): QDQBertAttention(\n", + " (self): QDQBertSelfAttention(\n", + " (query): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=21.7367 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1565, 0.5091](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (key): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=21.7367 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1599, 0.5542](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (value): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=21.7367 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0913, 0.2951](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (matmul_q_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=8.3141 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_k_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=8.3140 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_v_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=3.5668 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_a_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=0.6283 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " (output): QDQBertSelfOutput(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=3.3139 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0822, 0.5450](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=5.0508 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=21.7367 
calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (intermediate): QDQBertIntermediate(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=3072, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=18.8718 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0845, 0.5929](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (output): QDQBertOutput(\n", + " (dense): QuantLinear(\n", + " in_features=3072, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=3.2979 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1073, 1.0226](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=4.1809 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=18.8718 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (11): QDQBertLayer(\n", + " (attention): QDQBertAttention(\n", + " (self): QDQBertSelfAttention(\n", + " (query): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=18.6916 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1717, 0.5519](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (key): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=18.6916 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1763, 0.5545](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (value): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=18.6916 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.1061, 0.3834](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (matmul_q_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=9.1137 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_k_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=7.2776 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_v_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=3.7551 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (matmul_a_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=0.5068 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " (output): QDQBertSelfOutput(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=3.7846 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0972, 0.5767](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " 
(add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=7.4840 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=18.6916 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (intermediate): QDQBertIntermediate(\n", + " (dense): QuantLinear(\n", + " in_features=768, out_features=3072, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=12.2354 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0989, 0.3906](3072) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " (output): QDQBertOutput(\n", + " (dense): QuantLinear(\n", + " in_features=3072, out_features=768, bias=True\n", + " (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=3.4522 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (_weight_quantizer): TensorQuantizer(8bit fake axis=(0,) amax=[0.0954, 0.9991](768) calibrator=MaxCalibrator scale=1.0 quant)\n", + " )\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (add_local_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=12.8952 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " (add_residual_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=12.2354 calibrator=HistogramCalibrator scale=1.0 quant)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (pooler): QDQBertPooler(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (activation): Tanh()\n", + " )\n", + " )\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (classifier): Linear(in_features=768, out_features=3, bias=True)\n", + ")" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "model_q = QDQBertForSequenceClassification.from_pretrained(\"roberta-in-bert-trained\", num_labels=num_labels)\n", - "model_q = model_q.cuda()\n", - "\n", - "# Find the TensorQuantizer and enable calibration\n", - "for name, module in tqdm(model_q.named_modules()):\n", - " if isinstance(module, quant_nn.TensorQuantizer):\n", - " if module._calibrator is not None:\n", - " module.disable_quant()\n", - " module.enable_calib()\n", - " else:\n", - " module.disable()\n", - "\n", - "with torch.no_grad():\n", - " for start_index in tqdm(range(0, 4*batch_size, batch_size)):\n", - " end_index = start_index + batch_size\n", - " data = encoded_dataset[\"train\"][start_index:end_index]\n", - " input_torch = {k: torch.tensor(list(v), dtype=torch.long, device=\"cuda\")\n", - " for k, v in data.items() if k in [\"input_ids\", \"attention_mask\", \"token_type_ids\"]}\n", - " model_q(**input_torch)\n", "\n", + "calibrate(model=model_q, encoded_dataset=encoded_dataset)\n", "\n", - "print(\"calibration\")\n", - "# Finalize calibration\n", - "for name, module in model_q.named_modules():\n", - " if isinstance(module, quant_nn.TensorQuantizer):\n", - " if module._calibrator is not None:\n", - " if isinstance(module._calibrator, calib.MaxCalibrator):\n", - " module.load_calib_amax()\n", - " else:\n", - " module.load_calib_amax(\"percentile\", percentile=99.99)\n", - " module.enable_quant()\n", - " module.disable_calib()\n", - " else:\n", - " module.enable()\n", - "\n", - "model_q.cuda()\n", - "\n", - "count = 0\n", - "for name, mod in model_q.named_modules():\n", - " if isinstance(mod, pytorch_quantization.nn.TensorQuantizer):\n", - " 
print(f\"{name:80} {mod}\")\n", - "    count += 1\n", - "print(f\"{count} TensorQuantizers found in model\")\n", - "model_q.save_pretrained(\"roberta-in-bert-trained-quantized\")" + "# count = 0\n", + "# for name, mod in model_q.named_modules():\n", + "#     if isinstance(mod, pytorch_quantization.nn.TensorQuantizer):\n", + "#         print(f\"{name:80} {mod}\")\n", + "#         count += 1\n", + "# print(f\"{count} TensorQuantizers found in model\")\n", + "# model_q.save_pretrained(\"roberta-in-bert-trained-quantized\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Quantization aware training\n", + "### Quantization Aware Training (QAT)\n", "\n", - "The query aware training is not a mandatory step, but highly recommended to get the best accuracy. Basically we will redo the training with the quantization enabled." + "Quantization aware training is not a mandatory step, but it is **highly** recommended to get the best accuracy. Basically, we redo the training with quantization enabled and a low learning rate." ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": { "id": "imY1oC3SIrJf" }, @@ -3122,1141 +1652,622 @@ "name": "stderr", "output_type": "stream", "text": [ - "loading configuration file roberta-in-bert-trained-quantized/config.json\n", - "Model config QDQBertConfig {\n", - " \"_name_or_path\": \"roberta-in-bert-trained\",\n", - " \"architectures\": [\n", - " \"QDQBertForSequenceClassification\"\n", - " ],\n", - " \"attention_probs_dropout_prob\": 0.1,\n", - " \"bos_token_id\": 0,\n", - " \"classifier_dropout\": null,\n", - " \"eos_token_id\": 2,\n", - " \"hidden_act\": \"gelu\",\n", - " \"hidden_dropout_prob\": 0.1,\n", - " \"hidden_size\": 768,\n", - " \"id2label\": {\n", - " \"0\": \"LABEL_0\",\n", - " \"1\": \"LABEL_1\",\n", - " \"2\": \"LABEL_2\"\n", - " },\n", - " \"initializer_range\": 0.02,\n", - " \"intermediate_size\": 3072,\n", - " \"label2id\": {\n", - " \"LABEL_0\": 0,\n", - " \"LABEL_1\": 1,\n", - " \"LABEL_2\": 2\n", - " },\n", - " \"layer_norm_eps\": 1e-05,\n", - " \"max_position_embeddings\": 514,\n", - " \"model_type\": \"qdqbert\",\n", - " \"num_attention_heads\": 12,\n", - " \"num_hidden_layers\": 12,\n", - " \"pad_token_id\": 1,\n", - " \"position_embedding_type\": \"absolute\",\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.13.0.dev0\",\n", - " \"type_vocab_size\": 1,\n", - " \"use_cache\": true,\n", - " \"vocab_size\": 50265\n", - "}\n", - "\n", - "loading weights file roberta-in-bert-trained-quantized/pytorch_model.bin\n", - "I1201 19:11:27.208793 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.209632 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.210307 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.211149 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.223644 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.224489 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.225447 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.226618 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.241246 140057592104768 
_utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.242141 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.243034 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.243859 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.244722 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.245520 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.246686 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.247766 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.264419 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.265261 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.265923 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.266575 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.267451 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.268172 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.287400 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.288151 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.288789 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.289382 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.309565 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.310387 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.311098 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.311751 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.312860 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.313377 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.324157 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.324974 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.325625 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.326216 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.336715 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.337255 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.337689 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.338090 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.347661 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 
19:11:27.348753 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.349757 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.350883 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.351833 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.352656 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.353345 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.353943 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.364943 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.365485 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.365902 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.366509 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.367229 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.368165 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.388085 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.388865 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.389516 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.390258 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.410784 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.411631 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.412297 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.413084 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.414086 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.414799 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.425738 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.426646 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.427272 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.427896 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.438486 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.439234 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n" + "[INFO|trainer.py:437] 2021-12-06 18:34:07,721 >> Using amp half precision backend\n" ] }, { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "I1201 19:11:27.439879 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.440475 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.450418 140057592104768 _utils.py:72] Input 
is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.451315 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.451903 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.452303 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.452749 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.453151 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.453549 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.453957 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.465221 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.465954 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.466644 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.467072 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.468285 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.468727 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.489801 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.490633 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.491233 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.491766 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.514694 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.515455 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.516121 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.516760 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.518003 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.518579 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.528168 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.528827 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.529607 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.530612 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.540609 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.541207 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.541832 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.542459 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.552270 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.552871 
140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.553499 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.554122 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.554818 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.555415 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.556003 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.556731 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.569805 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.570413 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.570830 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.571321 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.572377 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.572752 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.592881 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.593365 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.593739 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.594096 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.614654 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.615172 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.615548 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.615923 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.616773 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.617163 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.627428 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.628115 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.628840 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.629502 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.638837 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.639514 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.640233 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.640931 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.650447 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.651207 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis 
(0,)!\n", - "I1201 19:11:27.651915 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.652822 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.653721 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.654456 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.655095 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.655695 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.666741 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.667589 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.668479 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.669285 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.670184 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.670979 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.691772 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.692621 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n" + "{'eval_loss': 0.4492516815662384, 'eval_accuracy': 0.8271013754457464, 'eval_runtime': 46.2281, 'eval_samples_per_second': 212.317, 'eval_steps_per_second': 3.331}\n", + "{'eval_loss': 0.4492516815662384, 'eval_accuracy': 0.8271013754457464, 'eval_runtime': 46.2281, 'eval_samples_per_second': 212.317, 'eval_steps_per_second': 3.331}\n", + "{'loss': 0.4752, 'learning_rate': 9.188396349413299e-07, 'epoch': 0.08}\n", + "{'eval_loss': 0.4362102150917053, 'eval_accuracy': 0.8346408558329088, 'eval_runtime': 46.4717, 'eval_samples_per_second': 211.204, 'eval_steps_per_second': 3.314, 'epoch': 0.08}\n", + "{'loss': 0.4643, 'learning_rate': 8.373533246414604e-07, 'epoch': 0.16}\n", + "{'eval_loss': 0.42539361119270325, 'eval_accuracy': 0.8370860927152318, 'eval_runtime': 46.5627, 'eval_samples_per_second': 210.791, 'eval_steps_per_second': 3.307, 'epoch': 0.16}\n", + "{'loss': 0.4509, 'learning_rate': 7.558670143415907e-07, 'epoch': 0.24}\n", + "{'eval_loss': 0.42584264278411865, 'eval_accuracy': 0.8367804381049414, 'eval_runtime': 46.5106, 'eval_samples_per_second': 211.027, 'eval_steps_per_second': 3.311, 'epoch': 0.24}\n", + "{'loss': 0.4454, 'learning_rate': 6.743807040417211e-07, 'epoch': 0.33}\n", + "{'eval_loss': 0.427680641412735, 'eval_accuracy': 0.8410596026490066, 'eval_runtime': 46.5186, 'eval_samples_per_second': 210.991, 'eval_steps_per_second': 3.311, 'epoch': 0.33}\n", + "{'loss': 0.4486, 'learning_rate': 5.928943937418514e-07, 'epoch': 0.41}\n", + "{'eval_loss': 0.419879287481308, 'eval_accuracy': 0.8401426388181356, 'eval_runtime': 46.4807, 'eval_samples_per_second': 211.163, 'eval_steps_per_second': 3.313, 'epoch': 0.41}\n", + "{'loss': 0.444, 'learning_rate': 5.114895697522818e-07, 'epoch': 0.49}\n", + "{'eval_loss': 0.42938971519470215, 'eval_accuracy': 0.8374936321956189, 'eval_runtime': 46.467, 'eval_samples_per_second': 211.225, 'eval_steps_per_second': 3.314, 'epoch': 0.49}\n", + "{'loss': 0.442, 'learning_rate': 4.30003259452412e-07, 'epoch': 0.57}\n", + "{'eval_loss': 0.4225366413593292, 
'eval_accuracy': 0.8381049414161997, 'eval_runtime': 46.5078, 'eval_samples_per_second': 211.04, 'eval_steps_per_second': 3.311, 'epoch': 0.57}\n", + "{'loss': 0.4463, 'learning_rate': 3.485169491525424e-07, 'epoch': 0.65}\n", + "{'eval_loss': 0.423688679933548, 'eval_accuracy': 0.8393275598573612, 'eval_runtime': 46.4966, 'eval_samples_per_second': 211.09, 'eval_steps_per_second': 3.312, 'epoch': 0.65}\n", + "{'loss': 0.4488, 'learning_rate': 2.671121251629727e-07, 'epoch': 0.73}\n", + "{'eval_loss': 0.4213014543056488, 'eval_accuracy': 0.8401426388181356, 'eval_runtime': 46.5212, 'eval_samples_per_second': 210.979, 'eval_steps_per_second': 3.31, 'epoch': 0.73}\n", + "{'loss': 0.4354, 'learning_rate': 1.8562581486310303e-07, 'epoch': 0.81}\n", + "{'eval_loss': 0.4192813038825989, 'eval_accuracy': 0.8407539480387163, 'eval_runtime': 48.5842, 'eval_samples_per_second': 202.02, 'eval_steps_per_second': 3.17, 'epoch': 0.81}\n", + "{'loss': 0.4344, 'learning_rate': 1.0422099087353327e-07, 'epoch': 0.9}\n", + "{'eval_loss': 0.41954925656318665, 'eval_accuracy': 0.8381049414161997, 'eval_runtime': 48.5554, 'eval_samples_per_second': 202.14, 'eval_steps_per_second': 3.172, 'epoch': 0.9}\n", + "{'loss': 0.436, 'learning_rate': 2.2734680573663627e-08, 'epoch': 0.98}\n", + "{'eval_loss': 0.41829705238342285, 'eval_accuracy': 0.8401426388181356, 'eval_runtime': 46.6717, 'eval_samples_per_second': 210.299, 'eval_steps_per_second': 3.3, 'epoch': 0.98}\n", + "{'train_runtime': 4966.1274, 'train_samples_per_second': 79.076, 'train_steps_per_second': 2.471, 'train_loss': 0.4474433752206656, 'epoch': 1.0}\n", + "{'eval_loss': 0.427680641412735, 'eval_accuracy': 0.8410596026490066, 'eval_runtime': 46.5232, 'eval_samples_per_second': 210.97, 'eval_steps_per_second': 3.31, 'epoch': 1.0}\n", + "{'eval_loss': 0.427680641412735, 'eval_accuracy': 0.8410596026490066, 'eval_runtime': 46.5232, 'eval_samples_per_second': 210.97, 'eval_steps_per_second': 3.31, 'epoch': 1.0}\n" ] - }, + } + ], + "source": [ + "model_q = QDQBertForSequenceClassification.from_pretrained(\"roberta-in-bert-trained-quantized\", num_labels=num_labels)\n", + "model_q = model_q.cuda()\n", + "\n", + "args.learning_rate /= 10\n", + "trainer = Trainer(\n", + " model_q,\n", + " args,\n", + " train_dataset=encoded_dataset[\"train\"],\n", + " eval_dataset=encoded_dataset[validation_key],\n", + " tokenizer=tokenizer,\n", + " compute_metrics=compute_metrics\n", + ")\n", + "transformers.logging.set_verbosity_error()\n", + "print(trainer.evaluate())\n", + "trainer.train()\n", + "print(trainer.evaluate())\n", + "model_q.save_pretrained(\"roberta-in-bert-trained-quantized-bis\")\n", + "del model_q\n", + "del trainer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Benchmark" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Export a `QDQ Pytorch` model on `ONNX`, we need to enable fake quantization mode from Pytorch." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "I1201 19:11:27.693244 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.693841 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.714617 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.715979 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.716634 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.717667 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.719031 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.719666 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.731079 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.731843 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.732547 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.733175 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.746136 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.746871 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.747591 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.748189 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.760656 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.761277 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.762013 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.762758 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.763410 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.763994 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.764589 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.765226 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.779591 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.780524 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.781319 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.781931 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.782832 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.783449 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.805895 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.806693 140057592104768 _utils.py:75] Weight is fake 
quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.807322 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.807956 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.834656 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.835458 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.836114 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.836716 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.838033 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.838794 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.849754 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.850384 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.851008 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.851848 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.866665 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.867449 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.868165 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.868940 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.878483 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.879086 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.879875 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.880830 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.881797 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.882521 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.883174 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.883836 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.895553 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.895962 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.896406 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.896936 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.898256 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.898759 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.919639 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.920164 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.920756 140057592104768 
tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.921738 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.942070 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.942806 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.943764 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.944607 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.945640 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.946361 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.958386 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.958937 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.959405 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.959839 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.969992 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:27.970374 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.970793 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.971190 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.980629 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n" - ] - }, + "/home/geantvert/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/pytorch_quantization/nn/modules/tensor_quantizer.py:285: TracerWarning: Converting a tensor to a Python number might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " inputs, amax.item() / bound, 0,\n", + "/home/geantvert/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/pytorch_quantization/nn/modules/tensor_quantizer.py:291: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs!\n", + " quant_dim = list(amax.shape).index(list(amax_sequeeze.shape)[0])\n" + ] + } + ], + "source": [ + "data = encoded_dataset[\"train\"][0: 3]\n", + "input_torch = {k: torch.tensor(v, dtype=torch.long, device=\"cuda\") for k, v in data.items() if k in [\"input_ids\", \"attention_mask\", \"token_type_ids\"]}\n", + "\n", + "from pytorch_quantization.nn import TensorQuantizer\n", + "model_q = QDQBertForSequenceClassification.from_pretrained(\"roberta-in-bert-trained-quantized-bis\", num_labels=num_labels)\n", + "model_q = model_q.cuda()\n", + "TensorQuantizer.use_fb_fake_quant = True\n", + "convert_to_onnx(model_q, output_path=\"model_q.onnx\", inputs_pytorch=input_torch)\n", + "TensorQuantizer.use_fb_fake_quant = False\n", + "del model_q" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "#### Convert `ONNX` graph to `TensorRT` engine" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "engine = build_engine(\n", + " runtime=runtime,\n", + " onnx_file_path=\"model_q.onnx\",\n", + " logger=trt_logger,\n", + " min_shape=(batch_size, max_seq_len),\n", + " optimal_shape=(batch_size, max_seq_len),\n", + " max_shape=(batch_size, max_seq_len),\n", + " workspace_size=10000 * 1024 * 1024,\n", + " fp16=False,\n", + " int8=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "#### Prepare input and output buffer" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "profile_index = 0\n", + "np_input = {\"input_ids\": np.random.randint(1, 10000, size=(batch_size, max_seq_len), dtype=np.int64),\n", + " \"attention_mask\": np.ones(shape=(batch_size, max_seq_len), dtype=np.int64),\n", + " }\n", + "\n", + "stream: Stream = pycuda.driver.Stream()\n", + "\n", + "context: IExecutionContext = engine.create_execution_context()\n", + "context.set_optimization_profile_async(profile_index=profile_index, stream_handle=stream.handle)\n", + "input_binding_idxs, output_binding_idxs = get_binding_idxs(engine, profile_index) # type: List[int], List[int]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "#### Inference on `TensorRT`" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "I1201 19:11:27.980991 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.981393 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.981833 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.982313 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.982766 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.983162 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.983512 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.993079 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear 
with axis None!\n", - "I1201 19:11:27.993498 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:27.993848 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.994362 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:27.995240 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:27.995689 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.016281 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.016845 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.017403 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.017889 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.037999 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.038670 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.039304 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.039771 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.040842 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.041286 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.051465 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.052168 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.053006 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.053862 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.069291 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.069987 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.070711 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.071384 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.080609 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.081198 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.081861 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.082520 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.083215 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.084009 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.084729 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.085382 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.097800 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.099022 140057592104768 _utils.py:75] Weight is fake 
quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.100077 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.100697 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.101861 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.102420 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.124801 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.125305 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.125720 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.126162 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.148860 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.149511 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.149907 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.150279 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.151454 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.151886 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.162329 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.162862 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.163242 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.163593 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.173374 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.173928 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.174362 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.174846 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.186318 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.186879 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.187222 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.187582 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.188061 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.188420 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.188758 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.189087 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.199641 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.200148 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.200556 140057592104768 
tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.200956 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.201748 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.202094 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.225237 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.225745 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.226214 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.226853 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.249388 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n" + "[array([[ 0.1358001 , -1.4377486 , 1.3672757 ],\n", + " [-0.16206698, -1.149481 , 1.4266016 ],\n", + " [ 0.0163878 , -1.0470941 , 1.2498031 ],\n", + " [-0.21079333, -0.91275144, 1.2614312 ],\n", + " [ 0.13416213, -1.2132894 , 1.0915226 ],\n", + " [-0.23387383, -0.6663823 , 1.0708152 ],\n", + " [-0.4426742 , -0.64095986, 0.6767337 ],\n", + " [-0.39520252, -0.6310587 , 1.162437 ],\n", + " [-0.11956491, -0.9094458 , 1.2330313 ],\n", + " [-0.34652767, -0.56745625, 1.1321819 ],\n", + " [-0.3788384 , -0.9477967 , 1.3850961 ],\n", + " [-1.079162 , 0.04613969, 0.9176692 ],\n", + " [-0.12555303, -0.8791798 , 1.2635291 ],\n", + " [-0.12463601, -0.63906515, 0.95351076],\n", + " [ 0.31858096, -0.410717 , 0.69519377],\n", + " [ 0.07587517, -0.58817637, 0.82071406],\n", + " [ 0.1137608 , -0.8322618 , 0.6675602 ],\n", + " [-0.50839895, -0.8443974 , 1.462322 ],\n", + " [-0.14658742, -1.1222454 , 1.3913041 ],\n", + " [ 0.05990895, -1.4671483 , 1.5297441 ],\n", + " [ 0.17553274, -0.26642302, 0.67778957],\n", + " [ 0.14809372, -1.3270702 , 1.1495501 ],\n", + " [-0.1042301 , -0.8665275 , 0.90043837],\n", + " [-0.78590935, -0.6129427 , 0.9732029 ],\n", + " [-0.19332369, -0.8912125 , 1.1381842 ],\n", + " [ 0.50638545, -0.9965472 , 0.69867384],\n", + " [-0.0973227 , -0.8511242 , 1.2328701 ],\n", + " [ 0.16307044, -1.1843398 , 1.437165 ],\n", + " [-0.6260487 , -0.5227167 , 1.247594 ],\n", + " [-0.30106562, -0.6723875 , 1.1667051 ],\n", + " [ 0.01060311, -1.1707903 , 1.3197892 ],\n", + " [-0.22743034, -0.99327207, 0.9541633 ]], dtype=float32)]\n" ] - }, + } + ], + "source": [ + "tensorrt_output = infer_tensorrt(\n", + " context=context,\n", + " host_inputs=np_input,\n", + " input_binding_idxs=input_binding_idxs,\n", + " output_binding_idxs=output_binding_idxs,\n", + " stream=stream,\n", + ")\n", + "print(tensorrt_output)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "#### Conversion with `trtexec` (command line approach)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "#!/usr/src/tensorrt/bin/trtexec --onnx=model_q.onnx --shapes=input_ids:32x256,attention_mask:32x256 --int8 --workspace=6000 --saveEngine=\"test.plan\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Method 2: use a dedicated QDQ model\n", + "\n", + "In method 2, the idea is to take the source code of a specific model and add manually in the source code `QDQ` nodes. That way, quantization will work out of the box. 
Even if `Bert` has many variations, it seems that very few of them are really used in practice. The Hugging Face transformers library includes the `Bert` model.\n", + "Our library offers a dedicated implementation of `Roberta`.\n", + "\n", + "To adapt another architecture, you need to:\n", + "\n", + "* replace linear layers with their quantized version (see the sketch below)\n", + "* replace operations not supported out of the box by TensorRT with similar code that TensorRT does support.\n", + "\n", + "> It is not a complex process, but it requires some knowledge of the operations supported by `ONNX` and of the `TensorRT` framework.\n", + "\n", + "The process below is a bit simpler than method 1:\n", + "\n", + "* fine-tune the model on the task\n", + "* calibrate\n", + "* perform Quantization Aware Training (QAT)\n", + "\n", + "> you may skip step 1 if you want\n", + "\n", + "### Fine tuning the model" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "I1201 19:11:28.249994 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with 
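To make the first bullet above concrete, here is a minimal, hypothetical sketch of what "replace linear layers with their quantized version" means with `pytorch_quantization`. It is not the dedicated `Roberta` implementation shipped by this library; the class name and the dimensions (`QuantizedIntermediate`, `hidden_size`, `intermediate_size`) are illustrative assumptions.

```python
import torch
from pytorch_quantization import nn as quant_nn


class QuantizedIntermediate(torch.nn.Module):
    """Hypothetical feed-forward block: nn.Linear swapped for QuantLinear."""

    def __init__(self, hidden_size: int = 768, intermediate_size: int = 3072):
        super().__init__()
        # QuantLinear keeps the nn.Linear signature but fake-quantizes its
        # input activations and its weights through TensorQuantizer modules.
        self.dense = quant_nn.QuantLinear(hidden_size, intermediate_size)
        self.act = torch.nn.GELU()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.act(self.dense(hidden_states))


block = QuantizedIntermediate()
# These quantizers are what produce the QDQ (QuantizeLinear/DequantizeLinear)
# pairs in the ONNX export when TensorQuantizer.use_fb_fake_quant is enabled.
print(block.dense._input_quantizer)
print(block.dense._weight_quantizer)
```

With `TensorQuantizer.use_fb_fake_quant = True` during the ONNX export (as in the benchmark cell earlier), these quantizers are serialized as `QuantizeLinear`/`DequantizeLinear` pairs that TensorRT can fuse into INT8 kernels.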
axis (0,)!\n", - "I1201 19:11:28.309940 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.310598 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.311483 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.312092 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.334968 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.335727 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.336348 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.336934 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.360406 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.361209 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.362031 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.362898 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.363797 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.364415 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.375894 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.376536 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.377102 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.377916 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.390349 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.390866 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.391533 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.392543 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.403455 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.403934 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.404580 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.405518 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.406105 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.406684 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.407144 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.407640 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.417722 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.418179 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.418803 140057592104768 tensor_quantizer.py:101] Creating 
histogram calibrator\n", - "I1201 19:11:28.419415 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.420697 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.421152 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.444371 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.445104 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.445688 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.446305 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.469518 140057592104768 _utils.py:72] Input is fake quantized to 8 bits in QuantLinear with axis None!\n", - "I1201 19:11:28.470333 140057592104768 _utils.py:75] Weight is fake quantized to 8 bits in QuantLinear with axis (0,)!\n", - "I1201 19:11:28.470999 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.471604 140057592104768 tensor_quantizer.py:105] Creating Max calibrator\n", - "I1201 19:11:28.472492 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "I1201 19:11:28.473104 140057592104768 tensor_quantizer.py:101] Creating histogram calibrator\n", - "Some weights of the model checkpoint at roberta-in-bert-trained-quantized were not used when initializing QDQBertForSequenceClassification: ['bert.encoder.layer.2.attention.self.key._weight_quantizer._amax', 'bert.encoder.layer.4.output.dense._input_quantizer._amax', 'bert.encoder.layer.5.attention.self.matmul_q_input_quantizer._amax', 'bert.encoder.layer.3.attention.output.add_local_input_quantizer._amax', 'bert.encoder.layer.5.intermediate.dense._input_quantizer._amax', 'bert.encoder.layer.1.attention.output.add_local_input_quantizer._amax', 'bert.encoder.layer.8.intermediate.dense._weight_quantizer._amax', 'bert.encoder.layer.10.attention.self.value._input_quantizer._amax', 'bert.encoder.layer.5.attention.self.matmul_v_input_quantizer._amax', 'bert.encoder.layer.1.output.dense._weight_quantizer._amax', 'bert.encoder.layer.10.intermediate.dense._input_quantizer._amax', 'bert.encoder.layer.5.attention.output.add_local_input_quantizer._amax', 'bert.encoder.layer.10.attention.self.matmul_a_input_quantizer._amax', 'bert.encoder.layer.2.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.5.attention.output.dense._weight_quantizer._amax', 'bert.encoder.layer.6.attention.self.query._input_quantizer._amax', 'bert.encoder.layer.7.attention.self.query._input_quantizer._amax', 'bert.encoder.layer.3.attention.self.key._weight_quantizer._amax', 'bert.encoder.layer.5.attention.self.matmul_a_input_quantizer._amax', 'bert.encoder.layer.5.attention.self.key._input_quantizer._amax', 'bert.encoder.layer.4.attention.self.value._weight_quantizer._amax', 'bert.encoder.layer.3.attention.self.query._input_quantizer._amax', 'bert.encoder.layer.2.attention.self.query._input_quantizer._amax', 'bert.encoder.layer.6.intermediate.dense._input_quantizer._amax', 'bert.encoder.layer.2.attention.output.dense._weight_quantizer._amax', 'bert.encoder.layer.10.attention.self.key._weight_quantizer._amax', 'bert.encoder.layer.9.output.dense._input_quantizer._amax', 'bert.encoder.layer.11.attention.output.add_local_input_quantizer._amax', 'bert.encoder.layer.0.attention.self.matmul_k_input_quantizer._amax', 
'bert.encoder.layer.10.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.0.attention.self.value._weight_quantizer._amax', 'bert.encoder.layer.10.attention.self.matmul_q_input_quantizer._amax', 'bert.encoder.layer.1.attention.output.dense._input_quantizer._amax', 'bert.encoder.layer.9.attention.self.value._weight_quantizer._amax', 'bert.encoder.layer.2.intermediate.dense._weight_quantizer._amax', 'bert.encoder.layer.0.attention.output.dense._input_quantizer._amax', 'bert.encoder.layer.5.output.dense._input_quantizer._amax', 'bert.encoder.layer.8.output.add_local_input_quantizer._amax', 'bert.encoder.layer.7.intermediate.dense._weight_quantizer._amax', 'bert.encoder.layer.6.attention.self.value._weight_quantizer._amax', 'bert.encoder.layer.1.attention.self.query._weight_quantizer._amax', 'bert.encoder.layer.3.attention.self.key._input_quantizer._amax', 'bert.encoder.layer.3.attention.self.matmul_v_input_quantizer._amax', 'bert.encoder.layer.6.attention.self.key._input_quantizer._amax', 'bert.encoder.layer.8.attention.self.key._input_quantizer._amax', 'bert.encoder.layer.8.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.6.attention.self.matmul_q_input_quantizer._amax', 'bert.encoder.layer.2.attention.self.value._input_quantizer._amax', 'bert.encoder.layer.3.output.dense._input_quantizer._amax', 'bert.encoder.layer.11.output.add_local_input_quantizer._amax', 'bert.encoder.layer.0.attention.self.value._input_quantizer._amax', 'bert.encoder.layer.4.attention.output.add_local_input_quantizer._amax', 'bert.encoder.layer.2.attention.self.value._weight_quantizer._amax', 'bert.encoder.layer.9.attention.output.add_local_input_quantizer._amax', 'bert.encoder.layer.11.attention.self.query._input_quantizer._amax', 'bert.encoder.layer.10.output.dense._weight_quantizer._amax', 'bert.encoder.layer.8.attention.self.query._input_quantizer._amax', 'bert.encoder.layer.3.attention.self.matmul_k_input_quantizer._amax', 'bert.encoder.layer.4.attention.self.matmul_v_input_quantizer._amax', 'bert.encoder.layer.3.attention.self.matmul_q_input_quantizer._amax', 'bert.encoder.layer.1.attention.output.dense._weight_quantizer._amax', 'bert.encoder.layer.1.attention.self.matmul_a_input_quantizer._amax', 'bert.encoder.layer.11.output.dense._input_quantizer._amax', 'bert.encoder.layer.7.attention.self.matmul_v_input_quantizer._amax', 'bert.encoder.layer.1.attention.self.matmul_k_input_quantizer._amax', 'bert.encoder.layer.2.attention.self.key._input_quantizer._amax', 'bert.encoder.layer.0.attention.self.matmul_q_input_quantizer._amax', 'bert.encoder.layer.1.attention.self.value._input_quantizer._amax', 'bert.encoder.layer.4.attention.self.key._weight_quantizer._amax', 'bert.encoder.layer.0.attention.self.query._weight_quantizer._amax', 'bert.encoder.layer.8.attention.self.key._weight_quantizer._amax', 'bert.encoder.layer.11.intermediate.dense._weight_quantizer._amax', 'bert.encoder.layer.9.output.add_local_input_quantizer._amax', 'bert.encoder.layer.7.intermediate.dense._input_quantizer._amax', 'bert.encoder.layer.0.output.dense._input_quantizer._amax', 'bert.encoder.layer.0.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.10.attention.self.matmul_v_input_quantizer._amax', 'bert.encoder.layer.10.intermediate.dense._weight_quantizer._amax', 'bert.encoder.layer.10.attention.output.dense._weight_quantizer._amax', 'bert.encoder.layer.4.attention.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.7.attention.self.key._weight_quantizer._amax', 
'bert.encoder.layer.8.attention.self.value._input_quantizer._amax', 'bert.encoder.layer.5.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.6.attention.self.value._input_quantizer._amax', 'bert.encoder.layer.10.attention.output.dense._input_quantizer._amax', 'bert.encoder.layer.6.attention.self.key._weight_quantizer._amax', 'bert.encoder.layer.1.attention.self.key._input_quantizer._amax', 'bert.encoder.layer.4.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.1.attention.self.value._weight_quantizer._amax', 'bert.encoder.layer.9.attention.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.7.attention.output.add_local_input_quantizer._amax', 'bert.encoder.layer.5.output.dense._weight_quantizer._amax', 'bert.encoder.layer.2.attention.self.query._weight_quantizer._amax', 'bert.encoder.layer.0.attention.output.add_local_input_quantizer._amax', 'bert.encoder.layer.11.attention.self.matmul_q_input_quantizer._amax', 'bert.encoder.layer.5.attention.self.value._input_quantizer._amax', 'bert.encoder.layer.9.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.0.output.dense._weight_quantizer._amax', 'bert.encoder.layer.1.attention.self.query._input_quantizer._amax', 'bert.encoder.layer.8.attention.self.matmul_v_input_quantizer._amax', 'bert.encoder.layer.9.attention.self.matmul_a_input_quantizer._amax', 'bert.encoder.layer.8.attention.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.11.attention.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.11.attention.self.value._weight_quantizer._amax', 'bert.encoder.layer.4.intermediate.dense._weight_quantizer._amax', 'bert.encoder.layer.9.attention.self.matmul_v_input_quantizer._amax', 'bert.encoder.layer.6.attention.output.dense._input_quantizer._amax', 'bert.encoder.layer.1.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.7.attention.output.dense._input_quantizer._amax', 'bert.encoder.layer.11.attention.self.query._weight_quantizer._amax', 'bert.encoder.layer.4.attention.self.value._input_quantizer._amax', 'bert.encoder.layer.8.attention.self.matmul_a_input_quantizer._amax', 'bert.encoder.layer.2.attention.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.8.output.dense._input_quantizer._amax', 'bert.encoder.layer.9.attention.self.query._weight_quantizer._amax', 'bert.encoder.layer.6.attention.self.matmul_v_input_quantizer._amax', 'bert.encoder.layer.10.output.dense._input_quantizer._amax', 'bert.encoder.layer.6.output.add_local_input_quantizer._amax', 'bert.encoder.layer.4.attention.self.matmul_a_input_quantizer._amax', 'bert.encoder.layer.9.output.dense._weight_quantizer._amax', 'bert.encoder.layer.5.attention.self.query._input_quantizer._amax', 'bert.encoder.layer.3.intermediate.dense._weight_quantizer._amax', 'bert.encoder.layer.10.attention.self.key._input_quantizer._amax', 'bert.encoder.layer.6.attention.self.matmul_a_input_quantizer._amax', 'bert.encoder.layer.2.output.add_local_input_quantizer._amax', 'bert.encoder.layer.3.output.add_local_input_quantizer._amax', 'bert.encoder.layer.3.attention.self.value._weight_quantizer._amax', 'bert.encoder.layer.6.output.dense._weight_quantizer._amax', 'bert.encoder.layer.7.attention.self.value._weight_quantizer._amax', 'bert.encoder.layer.11.attention.self.matmul_a_input_quantizer._amax', 'bert.encoder.layer.1.intermediate.dense._weight_quantizer._amax', 'bert.encoder.layer.0.attention.self.query._input_quantizer._amax', 'bert.encoder.layer.3.output.dense._weight_quantizer._amax', 
'bert.encoder.layer.9.intermediate.dense._weight_quantizer._amax', 'bert.encoder.layer.10.attention.self.query._weight_quantizer._amax', 'bert.encoder.layer.9.attention.self.key._input_quantizer._amax', 'bert.encoder.layer.1.output.add_local_input_quantizer._amax', 'bert.encoder.layer.4.attention.self.matmul_k_input_quantizer._amax', 'bert.encoder.layer.9.attention.self.matmul_q_input_quantizer._amax', 'bert.encoder.layer.3.attention.self.value._input_quantizer._amax', 'bert.encoder.layer.7.attention.self.key._input_quantizer._amax', 'bert.encoder.layer.0.attention.self.key._weight_quantizer._amax', 'bert.encoder.layer.8.attention.self.matmul_q_input_quantizer._amax', 'bert.encoder.layer.11.attention.self.key._weight_quantizer._amax', 'bert.encoder.layer.8.attention.self.query._weight_quantizer._amax', 'bert.encoder.layer.2.attention.output.add_local_input_quantizer._amax', 'bert.encoder.layer.8.attention.output.dense._input_quantizer._amax', 'bert.encoder.layer.8.attention.output.add_local_input_quantizer._amax', 'bert.encoder.layer.7.output.dense._input_quantizer._amax', 'bert.encoder.layer.0.output.add_local_input_quantizer._amax', 'bert.encoder.layer.6.output.dense._input_quantizer._amax', 'bert.encoder.layer.11.intermediate.dense._input_quantizer._amax', 'bert.encoder.layer.1.attention.self.matmul_v_input_quantizer._amax', 'bert.encoder.layer.5.attention.self.key._weight_quantizer._amax', 'bert.encoder.layer.6.attention.output.dense._weight_quantizer._amax', 'bert.encoder.layer.3.attention.self.matmul_a_input_quantizer._amax', 'bert.encoder.layer.2.attention.self.matmul_q_input_quantizer._amax', 'bert.encoder.layer.1.intermediate.dense._input_quantizer._amax', 'bert.encoder.layer.4.attention.self.key._input_quantizer._amax', 'bert.encoder.layer.5.attention.output.dense._input_quantizer._amax', 'bert.encoder.layer.7.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.5.attention.self.value._weight_quantizer._amax', 'bert.encoder.layer.0.intermediate.dense._input_quantizer._amax', 'bert.encoder.layer.5.attention.self.matmul_k_input_quantizer._amax', 'bert.encoder.layer.10.attention.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.1.attention.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.5.intermediate.dense._weight_quantizer._amax', 'bert.encoder.layer.3.attention.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.11.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.4.attention.self.query._input_quantizer._amax', 'bert.encoder.layer.6.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.0.attention.self.key._input_quantizer._amax', 'bert.encoder.layer.8.output.dense._weight_quantizer._amax', 'bert.encoder.layer.5.attention.self.query._weight_quantizer._amax', 'bert.encoder.layer.10.attention.self.query._input_quantizer._amax', 'bert.encoder.layer.9.attention.output.dense._weight_quantizer._amax', 'bert.encoder.layer.11.attention.self.matmul_v_input_quantizer._amax', 'bert.encoder.layer.0.attention.output.dense._weight_quantizer._amax', 'bert.encoder.layer.4.output.dense._weight_quantizer._amax', 'bert.encoder.layer.10.attention.self.matmul_k_input_quantizer._amax', 'bert.encoder.layer.6.attention.self.matmul_k_input_quantizer._amax', 'bert.encoder.layer.4.attention.self.matmul_q_input_quantizer._amax', 'bert.encoder.layer.9.attention.output.dense._input_quantizer._amax', 'bert.encoder.layer.11.attention.self.key._input_quantizer._amax', 'bert.encoder.layer.8.attention.self.matmul_k_input_quantizer._amax', 
'bert.encoder.layer.1.attention.self.matmul_q_input_quantizer._amax', 'bert.encoder.layer.9.attention.self.key._weight_quantizer._amax', 'bert.encoder.layer.6.attention.self.query._weight_quantizer._amax', 'bert.encoder.layer.0.attention.self.matmul_v_input_quantizer._amax', 'bert.encoder.layer.2.attention.self.matmul_v_input_quantizer._amax', 'bert.encoder.layer.3.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.8.attention.self.value._weight_quantizer._amax', 'bert.encoder.layer.10.attention.output.add_local_input_quantizer._amax', 'bert.encoder.layer.2.attention.output.dense._input_quantizer._amax', 'bert.encoder.layer.1.attention.self.key._weight_quantizer._amax', 'bert.encoder.layer.1.output.dense._input_quantizer._amax', 'bert.encoder.layer.7.attention.output.dense._weight_quantizer._amax', 'bert.encoder.layer.5.attention.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.3.attention.output.dense._weight_quantizer._amax', 'bert.encoder.layer.7.output.dense._weight_quantizer._amax', 'bert.encoder.layer.3.attention.output.dense._input_quantizer._amax', 'bert.encoder.layer.2.intermediate.dense._input_quantizer._amax', 'bert.encoder.layer.5.output.add_local_input_quantizer._amax', 'bert.encoder.layer.7.attention.self.matmul_k_input_quantizer._amax', 'bert.encoder.layer.8.intermediate.dense._input_quantizer._amax', 'bert.encoder.layer.4.attention.self.query._weight_quantizer._amax', 'bert.encoder.layer.7.attention.self.value._input_quantizer._amax', 'bert.encoder.layer.2.output.dense._input_quantizer._amax', 'bert.encoder.layer.7.attention.self.matmul_a_input_quantizer._amax', 'bert.encoder.layer.11.attention.output.dense._weight_quantizer._amax', 'bert.encoder.layer.3.attention.self.query._weight_quantizer._amax', 'bert.encoder.layer.4.intermediate.dense._input_quantizer._amax', 'bert.encoder.layer.9.intermediate.dense._input_quantizer._amax', 'bert.encoder.layer.8.attention.output.dense._weight_quantizer._amax', 'bert.encoder.layer.6.attention.output.add_local_input_quantizer._amax', 'bert.encoder.layer.9.attention.self.query._input_quantizer._amax', 'bert.encoder.layer.11.attention.output.dense._input_quantizer._amax', 'bert.encoder.layer.10.output.add_local_input_quantizer._amax', 'bert.encoder.layer.0.attention.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.11.attention.self.matmul_k_input_quantizer._amax', 'bert.encoder.layer.7.attention.self.query._weight_quantizer._amax', 'bert.encoder.layer.6.attention.output.add_residual_input_quantizer._amax', 'bert.encoder.layer.2.attention.self.matmul_k_input_quantizer._amax', 'bert.encoder.layer.9.attention.self.matmul_k_input_quantizer._amax', 'bert.encoder.layer.9.attention.self.value._input_quantizer._amax', 'bert.encoder.layer.2.attention.self.matmul_a_input_quantizer._amax', 'bert.encoder.layer.0.attention.self.matmul_a_input_quantizer._amax', 'bert.encoder.layer.6.intermediate.dense._weight_quantizer._amax', 'bert.encoder.layer.4.attention.output.dense._input_quantizer._amax', 'bert.encoder.layer.4.attention.output.dense._weight_quantizer._amax', 'bert.encoder.layer.4.output.add_local_input_quantizer._amax', 'bert.encoder.layer.7.attention.self.matmul_q_input_quantizer._amax', 'bert.encoder.layer.2.output.dense._weight_quantizer._amax', 'bert.encoder.layer.7.output.add_local_input_quantizer._amax', 'bert.encoder.layer.11.output.dense._weight_quantizer._amax', 'bert.encoder.layer.7.attention.output.add_residual_input_quantizer._amax', 
'bert.encoder.layer.3.intermediate.dense._input_quantizer._amax', 'bert.encoder.layer.10.attention.self.value._weight_quantizer._amax', 'bert.encoder.layer.0.intermediate.dense._weight_quantizer._amax', 'bert.encoder.layer.11.attention.self.value._input_quantizer._amax']\n", - "- This IS expected if you are initializing QDQBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing QDQBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" + "[INFO|trainer.py:437] 2021-12-06 20:38:02,464 >> Using amp half precision backend\n" ] }, { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "All the weights of QDQBertForSequenceClassification were initialized from the model checkpoint at roberta-in-bert-trained-quantized.\n", - "If your task is similar to the task the model of the checkpoint was trained on, you can already use QDQBertForSequenceClassification for predictions without further training.\n", - "Using amp half precision backend\n", - "The following columns in the evaluation set don't have a corresponding argument in `QDQBertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.\n", - "***** Running Evaluation *****\n", - " Num examples = 9815\n", - " Batch size = 64\n" + "{'loss': 0.6886, 'learning_rate': 9.188396349413299e-06, 'epoch': 0.08}\n", + "{'eval_loss': 0.4678966999053955, 'eval_accuracy': 0.8171166581762608, 'eval_runtime': 18.7354, 'eval_samples_per_second': 523.874, 'eval_steps_per_second': 8.22, 'epoch': 0.08}\n", + "{'loss': 0.5021, 'learning_rate': 8.373533246414604e-06, 'epoch': 0.16}\n", + "{'eval_loss': 0.4271945059299469, 'eval_accuracy': 0.8333163525216505, 'eval_runtime': 18.5466, 'eval_samples_per_second': 529.209, 'eval_steps_per_second': 8.303, 'epoch': 0.16}\n", + "{'loss': 0.4682, 'learning_rate': 7.558670143415907e-06, 'epoch': 0.24}\n", + "{'eval_loss': 0.4240091145038605, 'eval_accuracy': 0.8358634742740703, 'eval_runtime': 18.6916, 'eval_samples_per_second': 525.101, 'eval_steps_per_second': 8.239, 'epoch': 0.24}\n", + "{'loss': 0.4491, 'learning_rate': 6.743807040417211e-06, 'epoch': 0.33}\n", + "{'eval_loss': 0.38295766711235046, 'eval_accuracy': 0.8523688232297504, 'eval_runtime': 18.6766, 'eval_samples_per_second': 525.524, 'eval_steps_per_second': 8.246, 'epoch': 0.33}\n", + "{'loss': 0.4292, 'learning_rate': 5.9289439374185145e-06, 'epoch': 0.41}\n", + "{'eval_loss': 0.3819591999053955, 'eval_accuracy': 0.8519612837493632, 'eval_runtime': 19.1793, 'eval_samples_per_second': 511.75, 'eval_steps_per_second': 8.029, 'epoch': 0.41}\n", + "{'loss': 0.4188, 'learning_rate': 5.114080834419818e-06, 'epoch': 0.49}\n", + "{'eval_loss': 0.3905084729194641, 'eval_accuracy': 0.8507386653082017, 'eval_runtime': 18.5694, 'eval_samples_per_second': 528.559, 'eval_steps_per_second': 8.293, 'epoch': 0.49}\n", + "{'loss': 0.4171, 'learning_rate': 4.30003259452412e-06, 'epoch': 0.57}\n", + "{'eval_loss': 0.36459046602249146, 'eval_accuracy': 0.8601120733571065, 'eval_runtime': 18.5686, 'eval_samples_per_second': 528.579, 'eval_steps_per_second': 8.294, 'epoch': 0.57}\n", + "{'loss': 0.4118, 'learning_rate': 3.4851694915254244e-06, 'epoch': 0.65}\n", + "{'eval_loss': 
0.35626235604286194, 'eval_accuracy': 0.8616403464085584, 'eval_runtime': 18.5178, 'eval_samples_per_second': 530.029, 'eval_steps_per_second': 8.316, 'epoch': 0.65}\n", + "{'loss': 0.4006, 'learning_rate': 2.670306388526728e-06, 'epoch': 0.73}\n", + "{'eval_loss': 0.3605223596096039, 'eval_accuracy': 0.8653082017320428, 'eval_runtime': 18.6003, 'eval_samples_per_second': 527.68, 'eval_steps_per_second': 8.279, 'epoch': 0.73}\n", + "{'loss': 0.3936, 'learning_rate': 1.8570730117340288e-06, 'epoch': 0.81}\n", + "{'eval_loss': 0.3559686243534088, 'eval_accuracy': 0.8653082017320428, 'eval_runtime': 18.5309, 'eval_samples_per_second': 529.656, 'eval_steps_per_second': 8.31, 'epoch': 0.81}\n", + "{'loss': 0.3945, 'learning_rate': 1.0422099087353325e-06, 'epoch': 0.9}\n", + "{'eval_loss': 0.3518819212913513, 'eval_accuracy': 0.8659195109526235, 'eval_runtime': 18.5189, 'eval_samples_per_second': 529.998, 'eval_steps_per_second': 8.316, 'epoch': 0.9}\n", + "{'loss': 0.3977, 'learning_rate': 2.2734680573663624e-07, 'epoch': 0.98}\n", + "{'eval_loss': 0.34959253668785095, 'eval_accuracy': 0.8677534386143657, 'eval_runtime': 18.5328, 'eval_samples_per_second': 529.602, 'eval_steps_per_second': 8.31, 'epoch': 0.98}\n", + "{'train_runtime': 2665.1824, 'train_samples_per_second': 147.345, 'train_steps_per_second': 4.605, 'train_loss': 0.44651927413343606, 'epoch': 1.0}\n", + "{'eval_loss': 0.34959253668785095, 'eval_accuracy': 0.8677534386143657, 'eval_runtime': 18.4913, 'eval_samples_per_second': 530.789, 'eval_steps_per_second': 8.328, 'epoch': 1.0}\n", + "{'eval_loss': 0.34959253668785095, 'eval_accuracy': 0.8677534386143657, 'eval_runtime': 18.4913, 'eval_samples_per_second': 530.789, 'eval_steps_per_second': 8.328, 'epoch': 1.0}\n" ] - }, + } + ], + "source": [ + "model_roberta: PreTrainedModel = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)\n", + "model_roberta = model_roberta.cuda()\n", + "\n", + "args.learning_rate = 1e-5\n", + "trainer = Trainer(\n", + " model_roberta,\n", + " args,\n", + " train_dataset=encoded_dataset[\"train\"],\n", + " eval_dataset=encoded_dataset[validation_key],\n", + " tokenizer=tokenizer,\n", + " compute_metrics=compute_metrics\n", + ")\n", + "transformers.logging.set_verbosity_error()\n", + "trainer.train()\n", + "print(trainer.evaluate())\n", + "# {'eval_loss': 0.3559744358062744, 'eval_accuracy': 0.8655119714722364, 'eval_runtime': 19.6678, 'eval_samples_per_second': 499.04, 'eval_steps_per_second': 7.83, 'epoch': 0.98}\n", + "trainer.save_model(\"roberta-model\")\n", + "del model_roberta\n", + "del trainer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Calibration" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ { "data": { - "text/html": [ - "\n", - "
\n", - " \n", - " \n", - " [154/154 07:31]\n", - "
\n", - " " - ], + "application/vnd.jupyter.widget-view+json": { + "model_id": "e696905fbfdf4a149cb2437482b20cf1", + "version_major": 2, + "version_minor": 0 + }, "text/plain": [ - "" + "0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "The following columns in the training set don't have a corresponding argument in `QDQBertForSequenceClassification.forward` and have been ignored: idx, hypothesis, premise.\n", - "***** Running training *****\n", - " Num examples = 392702\n", - " Num Epochs = 1\n", - " Instantaneous batch size per device = 32\n", - " Total train batch size (w. parallel, distributed & accumulation) = 32\n", - " Gradient Accumulation steps = 1\n", - " Total optimization steps = 12272\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'eval_loss': 0.5553710460662842, 'eval_accuracy': 0.7799286805909322, 'eval_runtime': 46.6334, 'eval_samples_per_second': 210.472, 'eval_steps_per_second': 3.302}\n" - ] - }, { "data": { - "text/html": [ - "\n", - "
\n", - " \n", - " \n", - " [12272/12272 1:22:46, Epoch 1/1]\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
StepTraining LossValidation LossAccuracy
10000.5814000.5056010.805807
20000.5424000.4819710.811105
30000.5108000.4698230.823637
40000.4940000.4596180.821905
50000.4827000.4188510.837596
60000.4712000.4178290.836373
70000.4607000.4315400.834947
80000.4601000.4020230.847376
90000.4577000.3967120.846052
100000.4354000.3984120.846460
110000.4368000.3961190.848701
120000.4349000.3985570.850229

" - ], + "application/vnd.jupyter.widget-view+json": { + "model_id": "e1369f4ac82f4f90b2194d209dc1c8cd", + "version_major": 2, + "version_minor": 0 + }, "text/plain": [ - "" + " 0%| | 0/4 [00:00> Using amp half precision backend\n" ] }, - { - "data": { - "text/html": [ - "\n", - "

\n", - " \n", - " \n", - " [154/154 56:29]\n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", "text": [ - "{'eval_loss': 0.39855679869651794, 'eval_accuracy': 0.8502292409577178, 'eval_runtime': 47.3757, 'eval_samples_per_second': 207.174, 'eval_steps_per_second': 3.251, 'epoch': 1.0}\n" + "LR: 1.0000000000000002e-06\n", + "{'eval_loss': 0.38657698035240173, 'eval_accuracy': 0.8526744778400408, 'eval_runtime': 47.6064, 'eval_samples_per_second': 206.17, 'eval_steps_per_second': 3.235}\n", + "{'eval_loss': 0.38657698035240173, 'eval_accuracy': 0.8526744778400408, 'eval_runtime': 47.6064, 'eval_samples_per_second': 206.17, 'eval_steps_per_second': 3.235}\n", + "{'loss': 0.4018, 'learning_rate': 9.187581486310301e-07, 'epoch': 0.08}\n", + "{'eval_loss': 0.38418063521385193, 'eval_accuracy': 0.8558329088130413, 'eval_runtime': 46.6509, 'eval_samples_per_second': 210.393, 'eval_steps_per_second': 3.301, 'epoch': 0.08}\n", + "{'loss': 0.3954, 'learning_rate': 8.373533246414604e-07, 'epoch': 0.16}\n", + "{'eval_loss': 0.3795166015625, 'eval_accuracy': 0.8589913397860418, 'eval_runtime': 46.5562, 'eval_samples_per_second': 210.821, 'eval_steps_per_second': 3.308, 'epoch': 0.16}\n", + "{'loss': 0.3916, 'learning_rate': 7.558670143415907e-07, 'epoch': 0.24}\n", + "{'eval_loss': 0.3784726560115814, 'eval_accuracy': 0.8558329088130413, 'eval_runtime': 46.5355, 'eval_samples_per_second': 210.914, 'eval_steps_per_second': 3.309, 'epoch': 0.24}\n", + "{'loss': 0.3909, 'learning_rate': 6.743807040417211e-07, 'epoch': 0.33}\n", + "{'eval_loss': 0.38643816113471985, 'eval_accuracy': 0.8565461029037188, 'eval_runtime': 46.544, 'eval_samples_per_second': 210.876, 'eval_steps_per_second': 3.309, 'epoch': 0.33}\n", + "{'loss': 0.3932, 'learning_rate': 5.928943937418514e-07, 'epoch': 0.41}\n", + "{'eval_loss': 0.3807451128959656, 'eval_accuracy': 0.8582781456953642, 'eval_runtime': 46.5617, 'eval_samples_per_second': 210.796, 'eval_steps_per_second': 3.307, 'epoch': 0.41}\n", + "{'loss': 0.3894, 'learning_rate': 5.114895697522818e-07, 'epoch': 0.49}\n", + "{'eval_loss': 0.3824027180671692, 'eval_accuracy': 0.8613346917982679, 'eval_runtime': 46.5541, 'eval_samples_per_second': 210.83, 'eval_steps_per_second': 3.308, 'epoch': 0.49}\n", + "{'loss': 0.3895, 'learning_rate': 4.3008474576271193e-07, 'epoch': 0.57}\n", + "{'eval_loss': 0.3791654407978058, 'eval_accuracy': 0.8613346917982679, 'eval_runtime': 46.5392, 'eval_samples_per_second': 210.897, 'eval_steps_per_second': 3.309, 'epoch': 0.57}\n", + "{'loss': 0.388, 'learning_rate': 3.4859843546284233e-07, 'epoch': 0.65}\n", + "{'eval_loss': 0.3764157295227051, 'eval_accuracy': 0.8595007641365258, 'eval_runtime': 47.0386, 'eval_samples_per_second': 208.659, 'eval_steps_per_second': 3.274, 'epoch': 0.65}\n", + "{'loss': 0.3928, 'learning_rate': 2.671121251629727e-07, 'epoch': 0.73}\n", + "{'eval_loss': 0.37711256742477417, 'eval_accuracy': 0.8613346917982679, 'eval_runtime': 48.7144, 'eval_samples_per_second': 201.48, 'eval_steps_per_second': 3.161, 'epoch': 0.73}\n", + "{'loss': 0.381, 'learning_rate': 1.857073011734029e-07, 'epoch': 0.81}\n", + "{'eval_loss': 0.38059118390083313, 'eval_accuracy': 0.8595007641365258, 'eval_runtime': 47.0072, 'eval_samples_per_second': 208.798, 'eval_steps_per_second': 3.276, 'epoch': 0.81}\n", + "{'loss': 0.3798, 'learning_rate': 1.0422099087353327e-07, 'epoch': 0.9}\n", + "{'eval_loss': 0.3735353648662567, 'eval_accuracy': 0.8599083036169128, 'eval_runtime': 48.5826, 
'eval_samples_per_second': 202.027, 'eval_steps_per_second': 3.17, 'epoch': 0.9}\n", + "{'loss': 0.3823, 'learning_rate': 2.2734680573663627e-08, 'epoch': 0.98}\n", + "{'eval_loss': 0.3766668438911438, 'eval_accuracy': 0.8596026490066225, 'eval_runtime': 48.2033, 'eval_samples_per_second': 203.617, 'eval_steps_per_second': 3.195, 'epoch': 0.98}\n", + "{'train_runtime': 5010.7316, 'train_samples_per_second': 78.372, 'train_steps_per_second': 2.449, 'train_loss': 0.3895211076798619, 'epoch': 1.0}\n", + "{'eval_loss': 0.3824027180671692, 'eval_accuracy': 0.8613346917982679, 'eval_runtime': 47.2938, 'eval_samples_per_second': 207.532, 'eval_steps_per_second': 3.256, 'epoch': 1.0}\n", + "{'eval_loss': 0.3824027180671692, 'eval_accuracy': 0.8613346917982679, 'eval_runtime': 47.2938, 'eval_samples_per_second': 207.532, 'eval_steps_per_second': 3.256, 'epoch': 1.0}\n" ] } ], "source": [ - "model_q = QDQBertForSequenceClassification.from_pretrained(\"roberta-in-bert-trained-quantized\", num_labels=num_labels)\n", - "model_q = model_q.cuda()\n", "\n", - "args.max_steps = -1\n", + "model_roberta_q: PreTrainedModel = QDQRobertaForSequenceClassification.from_pretrained(\"roberta-trained-quantized\", num_labels=num_labels)\n", + "model_roberta_q = model_roberta_q.cuda()\n", + "\n", + "args.learning_rate /= 10\n", + "print(f\"LR: {args.learning_rate}\")\n", "trainer = Trainer(\n", - " model_q,\n", + " model_roberta_q,\n", " args,\n", " train_dataset=encoded_dataset[\"train\"],\n", " eval_dataset=encoded_dataset[validation_key],\n", " tokenizer=tokenizer,\n", " compute_metrics=compute_metrics\n", ")\n", + "transformers.logging.set_verbosity_error()\n", "print(trainer.evaluate())\n", + "# 4 batches\n", + "# {'eval_loss': 0.38076257705688477, 'eval_accuracy': 0.8552215995924605, 'eval_runtime': 46.9577, 'eval_samples_per_second': 209.018, 'eval_steps_per_second': 3.28}\n", + "# 100 batches\n", + "# {'eval_loss': 0.386756956577301, 'eval_accuracy': 0.8516556291390729, 'eval_runtime': 48.9996, 'eval_samples_per_second': 200.308, 'eval_steps_per_second': 3.143}\n", "trainer.train()\n", "print(trainer.evaluate())\n", - "model_q.save_pretrained(\"roberta-in-bert-trained-quantized-bis\")\n", - "del model_q" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Latency measures\n", - "\n", - "Let's see if what we have done is useful...\n" + "# {'eval_loss': 0.40235549211502075, 'eval_accuracy': 0.8589913397860418, 'eval_runtime': 46.1754, 'eval_samples_per_second': 212.559, 'eval_steps_per_second': 3.335, 'epoch': 1.0}\n", + "model_roberta_q.save_pretrained(\"roberta-in-bert-trained-quantized-retrained\")\n", + "del model_roberta_q" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "source": [ - "## TensorRT quantized model" + "### Benchmark" ] }, { "cell_type": "code", - "execution_count": 23, - "metadata": {}, + "execution_count": 32, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "W1201 21:32:19.778870 140057592104768 tensor_quantizer.py:280] Use Pytorch's native experimental fake quantization.\n", - "/home/geantvert/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/pytorch_quantization-2.1.2-py3.9-linux-x86_64.egg/pytorch_quantization/nn/modules/tensor_quantizer.py:285: TracerWarning: Converting a tensor to a Python number might cause the trace to be incorrect. 
We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + "/home/geantvert/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/pytorch_quantization/nn/modules/tensor_quantizer.py:285: TracerWarning: Converting a tensor to a Python number might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", " inputs, amax.item() / bound, 0,\n", - "/home/geantvert/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/pytorch_quantization-2.1.2-py3.9-linux-x86_64.egg/pytorch_quantization/nn/modules/tensor_quantizer.py:291: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + "/home/geantvert/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/pytorch_quantization/nn/modules/tensor_quantizer.py:291: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", " quant_dim = list(amax.shape).index(list(amax_sequeeze.shape)[0])\n" ] } ], "source": [ + "model_roberta_q: PreTrainedModel = QDQRobertaForSequenceClassification.from_pretrained(\"roberta-in-bert-trained-quantized-retrained\", num_labels=num_labels)\n", + "model_roberta_q = model_roberta_q.cuda()\n", + "\n", + "data = encoded_dataset[\"train\"][1: 3]\n", + "input_torch = {k: torch.tensor(list(v), dtype=torch.long, device=\"cuda\") for k, v in data.items() if k in [\"input_ids\", \"attention_mask\", \"token_type_ids\"]}\n", + "\n", "from pytorch_quantization.nn import TensorQuantizer\n", "TensorQuantizer.use_fb_fake_quant = True\n", - "model_q = QDQBertForSequenceClassification.from_pretrained(\"roberta-in-bert-trained-quantized-bis\", num_labels=num_labels)\n", - "model_q = model_q.cuda()\n", - "print(trainer.evaluate())\n", - "convert_to_onnx(model_q, output_path=\"model_q.onnx\", inputs_pytorch=input_torch)\n", - "TensorQuantizer.use_fb_fake_quant = False\n", - "del model_q" + "convert_to_onnx(model_pytorch=model_roberta_q, output_path=\"roberta_q.onnx\", inputs_pytorch=input_torch)\n", + "TensorQuantizer.use_fb_fake_quant = False" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "!/usr/src/tensorrt/bin/trtexec --onnx=model_q.onnx --shapes=input_ids:1x384,attention_mask:1x384 --best --workspace=6000" + "## Latency measures\n", + "\n", + "Let's see if what we have done is useful...\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## TensorRT baseline" + "### TensorRT baseline" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "baseline_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)\n", - "baseline_model = baseline_model.cuda()\n", - "convert_to_onnx(baseline_model, output_path=\"baseline.onnx\", inputs_pytorch=input_torch)\n", - "del baseline_model" + "Below we export a randomly 
initialized Roberta model, the purpose is to only check the performance." ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 33, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "!/usr/src/tensorrt/bin/trtexec --onnx=baseline.onnx --shapes=input_ids:1x384,attention_mask:1x384 --fp16 --workspace=6000" + "data = encoded_dataset[\"train\"][1:10]\n", + "input_torch = {k: torch.tensor(list(v), dtype=torch.long, device=\"cuda\")\n", + " for k, v in data.items() if k in [\"input_ids\", \"attention_mask\", \"token_type_ids\"]}\n", + "\n", + "baseline_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)\n", + "baseline_model = baseline_model.cuda()\n", + "convert_to_onnx(baseline_model, output_path=\"baseline.onnx\", inputs_pytorch=input_torch)\n", + "del baseline_model" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 34, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "del baseline_model" + "#!/usr/src/tensorrt/bin/trtexec --onnx=baseline.onnx --shapes=input_ids:1x384,attention_mask:1x384 --best --workspace=6000" ] }, { diff --git a/roberta_classic.py b/roberta_classic.py index 7afa97b0..2393d6eb 100644 --- a/roberta_classic.py +++ b/roberta_classic.py @@ -1,28 +1,33 @@ -from datasets import load_dataset, load_metric -from tqdm import tqdm -from transformer_deploy.QDQModels.QDQRoberta import QDQRobertaForSequenceClassification +import logging -from transformers import AutoTokenizer -import pytorch_quantization.nn as quant_nn -from pytorch_quantization.tensor_quant import QuantDescriptor import numpy as np - +import pytorch_quantization.nn as quant_nn import torch +from datasets import load_dataset, load_metric +from pytorch_quantization import calib +from pytorch_quantization.tensor_quant import QuantDescriptor +from tqdm import tqdm from transformers import ( AutoModelForSequenceClassification, + AutoTokenizer, + IntervalStrategy, PreTrainedModel, - TrainingArguments, Trainer, - IntervalStrategy, + TrainingArguments, ) -from pytorch_quantization import calib + +from transformer_deploy.backends.ort_utils import convert_to_onnx +from transformer_deploy.QDQModels.QDQRoberta import QDQRobertaForSequenceClassification + + +logging.getLogger().setLevel(logging.WARNING) num_labels = 3 model_checkpoint = "roberta-base" batch_size = 32 validation_key = "validation_matched" dataset = load_dataset("glue", "mnli") -metric = load_metric('glue', "mnli") +metric = load_metric("glue", "mnli") nb_step = 1000 training_strategy = IntervalStrategy.STEPS @@ -30,10 +35,7 @@ def preprocess_function(examples): - return tokenizer(examples["premise"], examples["hypothesis"], - truncation=True, - padding="max_length", - max_length=256) + return tokenizer(examples["premise"], examples["hypothesis"], truncation=True, padding="max_length", max_length=256) def compute_metrics(eval_pred): @@ -42,6 +44,34 @@ def compute_metrics(eval_pred): return metric.compute(predictions=predictions, references=labels) +def fuse_qkv(model, quant_per_tensor: bool): + """Adjust quantization ranges to match an implementation where the QKV projections are implemented with a single GEMM. + Force the weight and output scale factors to match by taking the max of (Q,K,V). 
+ """ + + def fuse3(qq, qk, qv): + for mod in [qq, qk, qv]: + if not hasattr(mod, "_amax"): + print(" WARNING: NO AMAX BUFFER") + return + q = qq._amax.detach().item() + k = qk._amax.detach().item() + v = qv._amax.detach().item() + + amax = max(q, k, v) + qq._amax.fill_(amax) + qk._amax.fill_(amax) + qv._amax.fill_(amax) + print(f" q={q:5.2f} k={k:5.2f} v={v:5.2f} -> {amax:5.2f}") + + for name, mod in model.named_modules(): + if name.endswith(".attention.self"): + print(f"FUSE_QKV: {name}") + fuse3(mod.matmul_q_input_quantizer, mod.matmul_k_input_quantizer, mod.matmul_v_input_quantizer) + if quant_per_tensor: + fuse3(mod.query._weight_quantizer, mod.key._weight_quantizer, mod.value._weight_quantizer) + + encoded_dataset = dataset.map(preprocess_function, batched=True) args = TrainingArguments( @@ -62,7 +92,9 @@ def compute_metrics(eval_pred): metric_for_best_model="accuracy", ) -model_roberta: PreTrainedModel = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels) +model_roberta: PreTrainedModel = AutoModelForSequenceClassification.from_pretrained( + model_checkpoint, num_labels=num_labels +) model_roberta = model_roberta.cuda() trainer = Trainer( @@ -71,7 +103,7 @@ def compute_metrics(eval_pred): train_dataset=encoded_dataset["train"], eval_dataset=encoded_dataset[validation_key], tokenizer=tokenizer, - compute_metrics=compute_metrics + compute_metrics=compute_metrics, ) print(trainer.evaluate()) # {'eval_loss': 0.3559744358062744, 'eval_accuracy': 0.8655119714722364, 'eval_runtime': 19.6678, 'eval_samples_per_second': 499.04, 'eval_steps_per_second': 7.83, 'epoch': 0.98} @@ -86,8 +118,9 @@ def compute_metrics(eval_pred): quant_nn.QuantLinear.set_default_quant_desc_input(input_desc) quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc) +# keep it on CPU model_roberta_q: PreTrainedModel = QDQRobertaForSequenceClassification.from_pretrained("roberta-model") -model_roberta_q = model_roberta_q.cuda() + # Find the TensorQuantizer and enable calibration for name, module in tqdm(model_roberta_q.named_modules()): if isinstance(module, quant_nn.TensorQuantizer): @@ -98,11 +131,14 @@ def compute_metrics(eval_pred): module.disable() with torch.no_grad(): - for start_index in tqdm(range(0, 4*batch_size, batch_size)): + for start_index in tqdm(range(0, 128, batch_size)): end_index = start_index + batch_size data = encoded_dataset["train"][start_index:end_index] - input_torch = {k: torch.tensor(list(v), dtype=torch.long, device="cuda") - for k, v in data.items() if k in ["input_ids", "attention_mask", "token_type_ids"]} + input_torch = { + k: torch.tensor(list(v), dtype=torch.long, device="cpu") + for k, v in data.items() + if k in ["input_ids", "attention_mask", "token_type_ids"] + } model_roberta_q(**input_torch) @@ -124,7 +160,10 @@ def compute_metrics(eval_pred): model_roberta_q.save_pretrained("roberta-trained-quantized") del model_roberta_q -model_roberta_q: PreTrainedModel = QDQRobertaForSequenceClassification.from_pretrained("roberta-trained-quantized", num_labels=num_labels) + +model_roberta_q: PreTrainedModel = QDQRobertaForSequenceClassification.from_pretrained( + "roberta-trained-quantized", num_labels=num_labels +) model_roberta_q = model_roberta_q.cuda() args.learning_rate /= 10 @@ -135,10 +174,35 @@ def compute_metrics(eval_pred): train_dataset=encoded_dataset["train"], eval_dataset=encoded_dataset[validation_key], tokenizer=tokenizer, - compute_metrics=compute_metrics + compute_metrics=compute_metrics, ) print(trainer.evaluate()) +# 4 
batches # {'eval_loss': 0.38076257705688477, 'eval_accuracy': 0.8552215995924605, 'eval_runtime': 46.9577, 'eval_samples_per_second': 209.018, 'eval_steps_per_second': 3.28} +# 100 batches +# {'eval_loss': 0.386756956577301, 'eval_accuracy': 0.8516556291390729, 'eval_runtime': 48.9996, 'eval_samples_per_second': 200.308, 'eval_steps_per_second': 3.143} trainer.train() print(trainer.evaluate()) +# {'eval_loss': 0.40235549211502075, 'eval_accuracy': 0.8589913397860418, 'eval_runtime': 46.1754, 'eval_samples_per_second': 212.559, 'eval_steps_per_second': 3.335, 'epoch': 1.0} model_roberta_q.save_pretrained("roberta-in-bert-trained-quantized-retrained") + + +# fuse_qkv(model_roberta_q, quant_per_tensor=True) +data = encoded_dataset["train"][1:3] +input_torch = { + k: torch.tensor(list(v), dtype=torch.long, device="cuda") + for k, v in data.items() + if k in ["input_ids", "attention_mask", "token_type_ids"] +} + +from pytorch_quantization.nn import TensorQuantizer + + +TensorQuantizer.use_fb_fake_quant = True +convert_to_onnx(model_pytorch=model_roberta_q, output_path="roberta_q.onnx", inputs_pytorch=input_torch) +TensorQuantizer.use_fb_fake_quant = False +# /usr/src/tensorrt/bin/trtexec --onnx=roberta_q.onnx --shapes=input_ids:1x384,attention_mask:1x384 --best --workspace=6000 +# no fusing +# Latency: min = 1.85529 ms, max = 4.32666 ms, mean = 1.98449 ms, median = 1.87964 ms, percentile(99%) = 3.19434 ms +# with fusing +# Latency: min = 1.84412 ms, max = 2.22266 ms, mean = 1.87675 ms, median = 1.8717 ms, percentile(99%) = 2.07849 ms diff --git a/src/transformer_deploy/QDQModels/QDQRoberta.py b/src/transformer_deploy/QDQModels/QDQRoberta.py index 5f6b6369..a3db43ec 100644 --- a/src/transformer_deploy/QDQModels/QDQRoberta.py +++ b/src/transformer_deploy/QDQModels/QDQRoberta.py @@ -36,10 +36,11 @@ import torch import torch.utils.checkpoint from packaging import version +from pytorch_quantization import nn as quant_nn +from pytorch_quantization.nn.modules.tensor_quantizer import TensorQuantizer from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from transformers import RobertaConfig - from transformers.activations import ACT2FN, gelu from transformers.file_utils import ( add_code_sample_docstrings, @@ -65,8 +66,6 @@ ) from transformers.utils import logging -from pytorch_quantization import nn as quant_nn -from pytorch_quantization.nn.modules.tensor_quantizer import TensorQuantizer logger = logging.get_logger(__name__) @@ -195,9 +194,7 @@ def __init__(self, config, position_embedding_type=None): self.value = quant_nn.QuantLinear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) + self.position_embedding_type = position_embedding_type or getattr(config, "position_embedding_type", "absolute") if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": self.max_position_embeddings = config.max_position_embeddings self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) @@ -1627,7 +1624,6 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l Returns: torch.Tensor """ # QDQ change below - # return torch.zeros(input_ids.shape, dtype=torch.long, device="cuda") # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. 
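# A minimal sketch (not the library's exact code) of the cast sequence the surrounding comments describe,
# assuming the usual Roberta convention where real positions start right after `padding_idx`;
# keeping the cumsum in float is what keeps the exported graph runnable under TensorRT.
import torch

def position_ids_sketch(input_ids: torch.Tensor, padding_idx: int) -> torch.Tensor:
    mask = input_ids.ne(padding_idx).float()                # 1.0 for real tokens, 0.0 for padding
    incremental_indices = torch.cumsum(mask, dim=1) * mask  # 1..n for real tokens, 0 for padding
    return incremental_indices.long() + padding_idx         # shift so padding stays at padding_idx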
# int() -> float() because of a limitations in cumsum operator implementation in TensorRT mask = input_ids.ne(padding_idx).float() diff --git a/src/transformer_deploy/backends/trt_utils.py b/src/transformer_deploy/backends/trt_utils.py index d8b352c7..872b6b41 100644 --- a/src/transformer_deploy/backends/trt_utils.py +++ b/src/transformer_deploy/backends/trt_utils.py @@ -157,13 +157,11 @@ def build_engine( ) if int8: config.set_flag(trt.BuilderFlag.INT8) - # config.set_quantization_flag(trt.QuantizationFlag.CALIBRATE_BEFORE_FUSION) - # config.int8_calibrator = Calibrator() - # if fp16: - config.set_flag(trt.BuilderFlag.FP16) + if fp16: + config.set_flag(trt.BuilderFlag.FP16) config.set_flag(trt.BuilderFlag.DISABLE_TIMING_CACHE) # https://github.com/NVIDIA/TensorRT/issues/1196 (sometimes big diff in output when using FP16) - config.set_flag(trt.BuilderFlag.STRICT_TYPES) + config.set_flag(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS) with open(onnx_file_path, "rb") as f: parser.parse(f.read()) profile: IOptimizationProfile = builder.create_optimization_profile() diff --git a/src/transformer_deploy/convert.py b/src/transformer_deploy/convert.py index dd56c459..e50687d4 100644 --- a/src/transformer_deploy/convert.py +++ b/src/transformer_deploy/convert.py @@ -25,6 +25,7 @@ import tensorrt as trt import torch from pycuda._driver import Stream +from pytorch_quantization.nn import TensorQuantizer from tensorrt.tensorrt import IExecutionContext, Logger, Runtime from torch.cuda import get_device_name from torch.cuda.amp import autocast @@ -40,7 +41,6 @@ ) from transformer_deploy.benchmarks.utils import prepare_input, print_timings, setup_logging, track_infer_time from transformer_deploy.templates.triton import Configuration, ModelType -from pytorch_quantization.nn import TensorQuantizer def main(): @@ -130,8 +130,17 @@ def main(): timings = {} - if "pytorch" in args.backend: - with torch.inference_mode(): + with torch.inference_mode(): + for _ in range(args.warmup): + _ = model_pytorch(**inputs_pytorch) + torch.cuda.synchronize() + time_buffer = [] + for _ in range(args.nb_measures): + with track_infer_time(time_buffer): + _ = model_pytorch(**inputs_pytorch) + torch.cuda.synchronize() + timings["Pytorch (FP32)"] = time_buffer + with autocast(): for _ in range(args.warmup): _ = model_pytorch(**inputs_pytorch) torch.cuda.synchronize() @@ -140,17 +149,7 @@ def main(): with track_infer_time(time_buffer): _ = model_pytorch(**inputs_pytorch) torch.cuda.synchronize() - timings["Pytorch (FP32)"] = time_buffer - with autocast(): - for _ in range(args.warmup): - _ = model_pytorch(**inputs_pytorch) - torch.cuda.synchronize() - time_buffer = [] - for _ in range(args.nb_measures): - with track_infer_time(time_buffer): - _ = model_pytorch(**inputs_pytorch) - torch.cuda.synchronize() - timings["Pytorch (FP16)"] = time_buffer + timings["Pytorch (FP16)"] = time_buffer del model_pytorch if "tensorrt" in args.backend: @@ -164,7 +163,7 @@ def main(): optimal_shape=tensor_shapes[1], max_shape=tensor_shapes[2], workspace_size=args.workspace_size * 1024 * 1024, - fp16=True, + fp16=not args.quantization, int8=args.quantization, ) save_engine(engine=engine, engine_file_path=tensorrt_path) From dc3fc1920d0b03e8d361b20e86c1740621f0910e Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Tue, 7 Dec 2021 23:09:13 +0100 Subject: [PATCH 06/15] update quantization notebook --- .gitignore | 1 + README.md | 2 +- ...on.ipynb => quantization_end_to_end.ipynb} | 1755 ++++++----------- requirements.txt | 2 +- requirements_gpu.txt | 
1 - src/transformer_deploy/backends/trt_utils.py | 44 +- src/transformer_deploy/convert.py | 22 +- 7 files changed, 680 insertions(+), 1147 deletions(-) rename demo/{text_classification_quantization.ipynb => quantization_end_to_end.ipynb} (63%) diff --git a/.gitignore b/.gitignore index 5c719f5c..5f327c39 100644 --- a/.gitignore +++ b/.gitignore @@ -216,3 +216,4 @@ cython_debug/ .idea/ TensorRT/ triton_models/ +demo/roberta-*/ diff --git a/README.md b/README.md index c8c14879..e74842cd 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# From 🤗 to 🤯, Hugging Face Transformer submillisecond inference️ and deployment to production +# Hugging Face Transformer submillisecond inference️ and deployment to production: 🤗 → 🤯 [![tests](https://github.com/ELS-RD/transformer-deploy/actions/workflows/python-app.yml/badge.svg)](https://github.com/ELS-RD/transformer-deploy/actions/workflows/python-app.yml) [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](./LICENCE) [![Python 3.6](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/downloads/release/python-360/) diff --git a/demo/text_classification_quantization.ipynb b/demo/quantization_end_to_end.ipynb similarity index 63% rename from demo/text_classification_quantization.ipynb rename to demo/quantization_end_to_end.ipynb index 26fe5ca5..2c16c604 100644 --- a/demo/text_classification_quantization.ipynb +++ b/demo/quantization_end_to_end.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# A recipe to perform Nvidia GPU int-8 quantization on most transformers model" + "# Recipes to perform Nvidia GPU INT-8 quantization on most transformers model" ] }, { @@ -12,62 +12,72 @@ "metadata": {}, "source": [ "Quantization is one of the most effective and generic approach to make model inference faster.\n", - "Basically it replaces float numbers generally encoded in 16 or 32 bits by integers encoded in 8 bits or less:\n", + "Basically, it replaces high precision float numbers in model tensors encoded in 32 or 16 bits by lower precision ones encoded in 8 bits or less:\n", "\n", "* it takes less memory\n", "* computation is easier / faster\n", "\n", - "**GPU quantization is a way to double the inference speed of your GPU**.\n", - "It can be applied to any model in theory, and unlike distillation, if done well, it should not decrease your model accuracy.\n", + "It can be applied to any model in theory, and, if done well, it should not decrease model accuracy.\n", "\n", "The purpose of this tutorial is to show 2 processes to perform quantization on most `transformer` architecture.\n", "\n", - "## What is int-8 quantization?\n", + "**TL;DR, inference is 5 times faster on a `Roberta-base` model** with a batch of size 32 / seq len 384, benchmark (bold, quantization):\n", "\n", - "Basic idea behind the expression int-8 quantization is that instead of doing deep learning computations with `float` numbers (usually encoded on 32 bits), you use integers (encoded on 8 bits). 
On a large matrix multiplication it has 2 effects:\n", + "| Framework | Precision | Latency (ms) | Accuracy | Speedup |\n", + "| -------------------------- | --------- | ------------ | -------- | ------- |\n", + "| Pytorch | FP32 | 76.31 | --- | X 1 |\n", + "| Pytorch | FP16 | 56.04 | --- | X 1.4 |\n", + "| TensorRT | FP16 | 30.17 | --- | X 2.5 |\n", + "| TensorRT (transplantation) | **INT-8** | 15.83 | --- | **X 5** |\n", + "| TensorRT (custom QDQ code) | **INT-8** | 14.94 | --- | **X 5** |\n", "\n", - "* it reduces by a large margin the size in memory, making **memory transfer faster** (on GPU, many operations are very fast to compute, and memory transfer is the main bottleneck, they are called memory bound)\n", - "* it also makes **computation faster** accelerating the slowest operations (in transformer, mainly big matrix multiplication during the self attention comptutation)\n", + "> same kind of acceleration is observed on all seq len / batch sizes\n", "\n", - "A 8-bit integer can encode values from -128 to +127, and no decimal (as it's an integer).\n", - "So a 8-bit integer can't encode values like `1280.872654`.\n", "\n", - "However we can use our integer if it's associated to a scale (a FP32 scale). For instance, for a scale of 20, I can set my integer to 64 (64*20=1280), it's not exactly `1280.872654` but it's close enough.\n", + "## A (very) short intro to INT-8 quantization\n", "\n", - "That's why we need to perform a step called `calibration` during which the range of values and the scale (encoded as a FP32 float) will be computed.\n", + "Basic idea behind model quantization is to replace tensors made of float numbers (usually encoded on 32 bits) by lower precision representation (encoded on 8 bits for Nvidia GPUs), in general integers.\n", + "Therefore computation is faster and model memory footprint is lower. Making tensor storage smaller makes memory transfer faster... 
and is also a computation acceleration factor.\n", + "This technique is very interesting for its trade-off: you reduce inference time significantly, and in most scenarios it costs close to nothing in accuracy.\n", "\n", + "Replacing float numbers by integers is done through a mapping.\n", + "This step is called `calibration`, and its purpose is to compute for each tensor or each channel of a tensor (one of its dimensions) a range of all possible values and then define a scale and a distribution center to map float numbers to 8-bit integers.\n", + "The process is well described in this [Nvidia presentation](https://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf).\n", "\n", + "There are several ways to perform quantization, depending on how and when the `calibration` is performed:\n", "\n", + "* dynamically: the mapping is done during inference; there is some overhead, but it's easy to put in place and accuracy is usually preserved,\n", + "* statically, after training (`post training quantization` or `PTQ`): this way is efficient, but it may have a significant accuracy cost,\n", + "* statically, before training (`quantization aware training` or `QAT`): this way is efficient and has a low accuracy cost, as the weights adapt to the quantization during training\n", "\n", + "In this guide we will focus on the third option: QAT.\n", "\n", + "During quantization aware training:\n", + "\n", + "* internally, Pytorch works with high precision float numbers,\n", + "* externally, Pytorch simulates that quantization has already been applied and outputs results accordingly (for loss computation, for instance)\n", + "* it also refines the quantization mapping (scale, range, distribution center, etc.)\n", + "\n", + "You can check this [high quality blog post](https://leimao.github.io/article/Neural-Networks-Quantization/) for more information.\n", "\n", "## Why a dedicated tutorial?\n", "\n", - "CPU quantization is supported out of the box by `Pytorch` or ONNX Runtime.\n", - "GPU quantization on the other side requires specific tools and process to be applied.\n", + "CPU quantization is supported out of the box by `Pytorch` or `ONNX Runtime`.\n", + "**GPU quantization, on the other hand, requires specific tools and processes to be applied**.\n", "\n", - "In the specific case of `transformer` models, right now (december 2021), the only way shown by Nvidia is to build manually the graph of your model in `TensorRT`. This is a low level approach, based on GPU capacity knowledge (which operator are supported, etc.). It's certainly out of reach of most NLP practitioners.\n", + "In the specific case of `transformer` models, right now (December 2021), the only way shown by Nvidia is to manually build the graph of our models in `TensorRT`. This is a low-level approach, based on knowledge of GPU capabilities (which operators are supported, etc.). 
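To make the scale and calibration idea above concrete, here is a toy, self-contained sketch of the symmetric per-tensor quantize/dequantize simulation that QAT relies on. This is our own illustrative code under assumed conventions (function name, per-tensor scheme), not the `pytorch-quantization` API used later in the notebook:

```python
import torch


def fake_quant_per_tensor(x: torch.Tensor, num_bits: int = 8) -> torch.Tensor:
    # symmetric mapping: [-amax, +amax] -> [-127, +127] for 8 bits
    bound = 2 ** (num_bits - 1) - 1
    amax = x.abs().max().clamp(min=1e-8)  # the "calibrated" range (here simply the observed max)
    scale = amax / bound
    x_int8 = torch.clamp(torch.round(x / scale), -bound, bound)  # what INT-8 storage would hold
    return x_int8 * scale  # dequantize: the float value the network actually sees during QAT


x = torch.randn(2, 4)
print(x - fake_quant_per_tensor(x))  # residual = quantization error
```

In real QAT the range (`amax`) comes from a calibration pass and is then refined during training, which is exactly what the calibration and fine-tuning steps later in this notebook do.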
It's certainly out of reach of most NLP practitioners and is very time consuming to update/adapt to new architectures.\n", "\n", "Hopefully, Nvidia recently added to Hugging Face `transformer` library a new model called `QDQBert`.\n", - "Basically, it's a vanilla `Bert` architecture which supports int-8 quantization.\n", + "Basically, it's a vanilla `Bert` architecture which supports INT-8 quantization.\n", "It doesn't support any other architecture out of the box, like `Albert`, `Roberta`, or `Electra`.\n", - "The Nvidia demo is dedicated to SQuaD task.\n", - "\n", - "The code from Nvidia only supports out of the box vanilla `Bert` model (and not similar models, like RoBerta & co).\n", - "The demo from Nvidia is on the SQuaD task, it's cool but it makes the code a lot less clear that needed.\n", + "Nvidia also provide a demo dedicated to the SQuaD task.\n", "\n", "To be both simple and cover most use cases, in this tutorial we will see:\n", "\n", - "* how to perform GPU quantization on **any** transformer model (not just Bert) using a simple trick\n", - "* how to to apply quantization to a common task like classification (which is easier to understand than question answering)\n", - "* measure performance gain (latency)\n", - "\n", - "## ToC\n", - "\n", - "### [Dependencies](#Dependencies-installation)" + "* how to perform GPU quantization on **any** transformer model (not just Bert) using a simple trick, a `transplatation`\n", + "* how to perform GPU quantization on `QDQRoberta`, a custom model similar to `QDQBert` and supported by `transformer-deploy` library\n", + "* how to apply quantization to a common task like classification (which is easier to understand than question answering)\n", + "* measure performance gain (latency)\n" ] }, { @@ -96,8 +106,9 @@ "source": [ "#! pip install git+https://github.com/huggingface/transformers\n", "#! pip install git+https://github.com/ELS-RD/transformer-deploy\n", - "#! pip install sklearn datasets -U\n", - "#! pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com" + "#! pip install sklearn datasets\n", + "#! pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com\n", + "# or install pytorch-quantization from https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization" ] }, { @@ -122,7 +133,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Mon Dec 6 17:39:28 2021 \r\n", + "Tue Dec 7 21:16:44 2021 \r\n", "+-----------------------------------------------------------------------------+\r\n", "| NVIDIA-SMI 495.29.05 Driver Version: 495.29.05 CUDA Version: 11.5 |\r\n", "|-------------------------------+----------------------+----------------------+\r\n", @@ -131,7 +142,7 @@ "| | | MIG M. |\r\n", "|===============================+======================+======================|\r\n", "| 0 NVIDIA GeForce ... 
On | 00000000:03:00.0 On | N/A |\r\n", - "| 70% 55C P8 47W / 350W | 304MiB / 24267MiB | 15% Default |\r\n", + "| 35% 42C P8 40W / 350W | 221MiB / 24267MiB | 0% Default |\r\n", "| | | N/A |\r\n", "+-------------------------------+----------------------+----------------------+\r\n", " \r\n", @@ -140,11 +151,10 @@ "| GPU GI CI PID Type Process name GPU Memory |\r\n", "| ID ID Usage |\r\n", "|=============================================================================|\r\n", - "| 0 N/A N/A 1636 G /usr/lib/xorg/Xorg 162MiB |\r\n", - "| 0 N/A N/A 7876 G /usr/bin/gnome-shell 45MiB |\r\n", - "| 0 N/A N/A 21136 G ...AAAAAAAAA= --shared-files 20MiB |\r\n", - "| 0 N/A N/A 129021 G ...AAAAAAAAA= --shared-files 38MiB |\r\n", - "| 0 N/A N/A 2438985 G ...359197.log --shared-files 33MiB |\r\n", + "| 0 N/A N/A 1903 G /usr/lib/xorg/Xorg 124MiB |\r\n", + "| 0 N/A N/A 7277 G /usr/bin/gnome-shell 38MiB |\r\n", + "| 0 N/A N/A 58986 G ..._57461.log --shared-files 19MiB |\r\n", + "| 0 N/A N/A 63844 G ...AAAAAAAAA= --shared-files 35MiB |\r\n", "+-----------------------------------------------------------------------------+\r\n" ] } @@ -193,7 +203,7 @@ "import numpy as np\n", "from tqdm.notebook import tqdm\n", "\n", - "from typing import Dict, OrderedDict, List\n", + "from typing import OrderedDict as OD, List, Dict\n", "import torch\n", "from torch import Tensor\n", "from transformers import (\n", @@ -212,7 +222,11 @@ "import logging\n", "import transformers\n", "import datasets\n", - "from transformer_deploy.backends.trt_utils import build_engine, get_binding_idxs, infer_tensorrt" + "from datasets import DatasetDict\n", + "from transformer_deploy.backends.trt_utils import build_engine, get_binding_idxs, infer_tensorrt, load_engine\n", + "from transformer_deploy.backends.ort_utils import convert_to_onnx\n", + "from collections import OrderedDict\n", + "from transformer_deploy.benchmarks.utils import track_infer_time, print_timings" ] }, { @@ -228,7 +242,7 @@ "from pycuda._driver import Stream\n", "import tensorrt as trt\n", "from tensorrt.tensorrt import IExecutionContext, Logger, Runtime\n", - "import pycuda.autoinit\n" + "import pycuda.autoinit" ] }, { @@ -283,7 +297,8 @@ "model_checkpoint = \"roberta-base\"\n", "batch_size = 32\n", "max_seq_len = 256\n", - "validation_key = \"validation_matched\"" + "validation_key = \"validation_matched\"\n", + "timings: Dict[str, List[float]] = dict()" ] }, { @@ -305,7 +320,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "18466bdd0e5b4e819e3bdadfa574eaa2", + "model_id": "e6b052cb482a4df89d092e40bca792b3", "version_major": 2, "version_minor": 0 }, @@ -352,7 +367,7 @@ "from datasets import load_dataset, load_metric\n", "\n", "dataset = load_dataset(\"glue\", task)\n", - "metric = load_metric('glue', task)\n", + "metric = load_metric(\"glue\", task)\n", "dataset" ] }, @@ -439,9 +454,9 @@ }, "outputs": [], "source": [ - "from transformers import AutoTokenizer\n", + "from transformers import AutoTokenizer, PreTrainedTokenizer\n", "\n", - "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)" + "tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)" ] }, { @@ -462,7 +477,9 @@ "outputs": [], "source": [ "def preprocess_function(examples):\n", - " return tokenizer(examples[\"premise\"], examples[\"hypothesis\"], truncation=True, padding=\"max_length\", max_length=max_seq_len)" + " return tokenizer(\n", + " examples[\"premise\"], examples[\"hypothesis\"], truncation=True, 
padding=\"max_length\", max_length=max_seq_len\n", + " )" ] }, { @@ -519,26 +536,17 @@ { "cell_type": "markdown", "metadata": { - "id": "545PP3o8IrJV" - }, - "source": [ - "## Fine-tuning model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FBiW8UpKIrJW" + "pycharm": { + "name": "#%% md\n" + } }, "source": [ - "Now that our data are ready, we can download the pretrained model and fine-tune it.\n", - "\n", - "We will also prepare some export function right now" + "Some functions required for training and exporting the model:" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 48, "metadata": { "pycharm": { "name": "#%%\n" @@ -546,24 +554,7 @@ }, "outputs": [], "source": [ - "def convert_to_onnx(model_pytorch: PreTrainedModel, output_path: str, inputs_pytorch: Dict[str, torch.Tensor]) -> None:\n", - " with torch.no_grad():\n", - " torch.onnx.export(\n", - " model_pytorch, # model to optimize\n", - " args=(inputs_pytorch[\"input_ids\"], inputs_pytorch[\"attention_mask\"]), # tuple of multiple inputs , inputs_pytorch[\"token_type_ids\"]\n", - " f=output_path, # output path / file object\n", - " opset_version=13, # the ONNX version to use, 13 is the first to support QDQ nodes\n", - " do_constant_folding=True, # simplify model (replace constant expressions)\n", - " input_names=[\"input_ids\", \"attention_mask\"], # input names \"token_type_ids\"\n", - " output_names=[\"model_output\"], # output name\n", - " dynamic_axes={ # declare dynamix axis for each input / output (dynamic axis == variable length axis)\n", - " \"input_ids\": {0: \"batch_size\", 1: \"sequence\"},\n", - " \"attention_mask\": {0: \"batch_size\", 1: \"sequence\"},\n", - " #\"token_type_ids\": {0: \"batch_size\", 1: \"sequence\"},\n", - " \"model_output\": {0: \"batch_size\"},\n", - " },\n", - " verbose=False,\n", - " )\n", + "from typing import Union\n", "\n", "\n", "def compute_metrics(eval_pred):\n", @@ -575,9 +566,9 @@ " return metric.compute(predictions=predictions, references=labels)\n", "\n", "\n", - "def calibrate(model: PreTrainedModel, encoded_dataset, nb_sample: int=128) -> None:\n", + "def calibrate(model: PreTrainedModel, encoded_dataset: DatasetDict, nb_sample: int = 128) -> PreTrainedModel:\n", " # Find the TensorQuantizer and enable calibration\n", - " for name, module in tqdm(model.named_modules()):\n", + " for name, module in model.named_modules():\n", " if isinstance(module, quant_nn.TensorQuantizer):\n", " if module._calibrator is not None:\n", " module.disable_quant()\n", @@ -589,11 +580,13 @@ " for start_index in tqdm(range(0, nb_sample, batch_size)):\n", " end_index = start_index + batch_size\n", " data = encoded_dataset[\"train\"][start_index:end_index]\n", - " input_torch = {k: torch.tensor(list(v), dtype=torch.long, device=\"cpu\")\n", - " for k, v in data.items() if k in [\"input_ids\", \"attention_mask\", \"token_type_ids\"]}\n", + " input_torch = {\n", + " k: torch.tensor(v, dtype=torch.long, device=\"cpu\")\n", + " for k, v in data.items()\n", + " if k in [\"input_ids\", \"attention_mask\", \"token_type_ids\"]\n", + " }\n", " model(**input_torch)\n", "\n", - "\n", " # Finalize calibration\n", " for name, module in model.named_modules():\n", " if isinstance(module, quant_nn.TensorQuantizer):\n", @@ -607,7 +600,23 @@ " else:\n", " module.enable()\n", "\n", - " model.cuda()\n" + " model.cuda()\n", + " return model\n", + "\n", + "\n", + "def convert_tensor(data: OD[str, List[List[int]]], output: str) -> OD[str, Union[np.ndarray, torch.Tensor]]:\n", + " input: 
OD[str, Union[np.ndarray, torch.Tensor]] = OrderedDict()\n", + " for k in [\"input_ids\", \"attention_mask\", \"token_type_ids\"]:\n", + " if k in data:\n", + " v = data[k]\n", + " if output == \"torch\":\n", + " value = torch.tensor(v, dtype=torch.long, device=\"cuda\")\n", + " elif output == \"np\":\n", + " value = np.asarray(v, dtype=np.int32)\n", + " else:\n", + " raise Exception(f\"unknown output type: {output}\")\n", + " input[k] = value\n", + " return input" ] }, { @@ -620,6 +629,22 @@ "profile_index = 0" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fine-tuning model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that our data are ready, we can download the pretrained model and fine-tune it.\n", + "\n", + "We will also prepare some export function right now" + ] + }, { "cell_type": "markdown", "metadata": { @@ -640,15 +665,15 @@ "nb_step = 1000\n", "strategy = IntervalStrategy.STEPS\n", "args = TrainingArguments(\n", - " f\"{model_checkpoint}-finetuned-{task}\",\n", - " evaluation_strategy = strategy,\n", + " f\"{model_checkpoint}-{task}\",\n", + " evaluation_strategy=strategy,\n", " eval_steps=nb_step,\n", " logging_steps=nb_step,\n", " save_steps=nb_step,\n", - " save_strategy = strategy,\n", + " save_strategy=strategy,\n", " learning_rate=1e-5,\n", " per_device_train_batch_size=batch_size,\n", - " per_device_eval_batch_size=batch_size*2,\n", + " per_device_eval_batch_size=batch_size * 2,\n", " num_train_epochs=1,\n", " fp16=True,\n", " group_by_length=True,\n", @@ -697,14 +722,18 @@ }, "outputs": [], "source": [ - "model_bert: PreTrainedModel = AutoModelForSequenceClassification.from_pretrained(\"bert-base-uncased\", num_labels=num_labels)\n", + "model_bert: PreTrainedModel = AutoModelForSequenceClassification.from_pretrained(\n", + " \"bert-base-uncased\", num_labels=num_labels\n", + ")\n", "bert_keys = list(model_bert.state_dict().keys())\n", "del model_bert\n", "\n", - "model_roberta: PreTrainedModel = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)\n", + "model_roberta: PreTrainedModel = AutoModelForSequenceClassification.from_pretrained(\n", + " model_checkpoint, num_labels=num_labels\n", + ")\n", "model_roberta.save_pretrained(\"roberta-in-bert\")\n", "del model_roberta\n", - "model_weights: OrderedDict[str, Tensor] = torch.load(\"roberta-in-bert/pytorch_model.bin\")\n", + "model_weights: OD[str, Tensor] = torch.load(\"roberta-in-bert/pytorch_model.bin\")\n", "\n", "# Roberta -> Bert, there is 1 to 1 correspondance, for other models, you may need to create your own mapping.\n", "for bert_key in bert_keys:\n", @@ -736,7 +765,7 @@ "\n", "with open(\"roberta-in-bert/config.json\") as f:\n", " content = json.load(f)\n", - " content['architectures'] = [\"bert\"]\n", + " content[\"architectures\"] = [\"bert\"]\n", "\n", "with open(\"roberta-in-bert/config.json\", mode=\"w\") as f:\n", " json.dump(content, f)" @@ -768,40 +797,22 @@ "name": "stderr", "output_type": "stream", "text": [ - "[INFO|trainer.py:437] 2021-12-06 17:39:49,638 >> Using amp half precision backend\n" + "[INFO|trainer.py:437] 2021-12-07 16:55:07,169 >> Using amp half precision backend\n" ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'loss': 0.7303, 'learning_rate': 9.1875814863103e-06, 'epoch': 0.08}\n", - "{'eval_loss': 0.5143516659736633, 'eval_accuracy': 0.8018339276617422, 'eval_runtime': 18.9153, 'eval_samples_per_second': 518.892, 'eval_steps_per_second': 8.142, 'epoch': 
0.08}\n", - "{'loss': 0.5419, 'learning_rate': 8.373533246414604e-06, 'epoch': 0.16}\n", - "{'eval_loss': 0.4696938693523407, 'eval_accuracy': 0.8183392766174223, 'eval_runtime': 19.0652, 'eval_samples_per_second': 514.813, 'eval_steps_per_second': 8.078, 'epoch': 0.16}\n", - "{'loss': 0.5056, 'learning_rate': 7.558670143415907e-06, 'epoch': 0.24}\n", - "{'eval_loss': 0.4684630036354065, 'eval_accuracy': 0.819969434538971, 'eval_runtime': 18.5425, 'eval_samples_per_second': 529.326, 'eval_steps_per_second': 8.305, 'epoch': 0.24}\n", - "{'loss': 0.4806, 'learning_rate': 6.744621903520209e-06, 'epoch': 0.33}\n", - "{'eval_loss': 0.42402705550193787, 'eval_accuracy': 0.8364747834946511, 'eval_runtime': 18.5925, 'eval_samples_per_second': 527.901, 'eval_steps_per_second': 8.283, 'epoch': 0.33}\n", - "{'loss': 0.4637, 'learning_rate': 5.929758800521513e-06, 'epoch': 0.41}\n", - "{'eval_loss': 0.41743752360343933, 'eval_accuracy': 0.8404482934284259, 'eval_runtime': 18.5681, 'eval_samples_per_second': 528.596, 'eval_steps_per_second': 8.294, 'epoch': 0.41}\n", - "{'loss': 0.4501, 'learning_rate': 5.1148956975228174e-06, 'epoch': 0.49}\n", - "{'eval_loss': 0.4184797704219818, 'eval_accuracy': 0.8368823229750382, 'eval_runtime': 18.5308, 'eval_samples_per_second': 529.658, 'eval_steps_per_second': 8.31, 'epoch': 0.49}\n", - "{'loss': 0.4488, 'learning_rate': 4.3008474576271195e-06, 'epoch': 0.57}\n", - "{'eval_loss': 0.397051602602005, 'eval_accuracy': 0.8456444218033622, 'eval_runtime': 18.5969, 'eval_samples_per_second': 527.776, 'eval_steps_per_second': 8.281, 'epoch': 0.57}\n", - "{'loss': 0.4404, 'learning_rate': 3.4859843546284226e-06, 'epoch': 0.65}\n", - "{'eval_loss': 0.39308467507362366, 'eval_accuracy': 0.8465613856342333, 'eval_runtime': 18.582, 'eval_samples_per_second': 528.201, 'eval_steps_per_second': 8.288, 'epoch': 0.65}\n", - "{'loss': 0.4311, 'learning_rate': 2.6711212516297265e-06, 'epoch': 0.73}\n", - "{'eval_loss': 0.39400529861450195, 'eval_accuracy': 0.8489047376464595, 'eval_runtime': 18.5238, 'eval_samples_per_second': 529.86, 'eval_steps_per_second': 8.314, 'epoch': 0.73}\n", - "{'loss': 0.4226, 'learning_rate': 1.8562581486310302e-06, 'epoch': 0.81}\n", - "{'eval_loss': 0.38930612802505493, 'eval_accuracy': 0.8527763627101376, 'eval_runtime': 18.5207, 'eval_samples_per_second': 529.948, 'eval_steps_per_second': 8.315, 'epoch': 0.81}\n", - "{'loss': 0.4239, 'learning_rate': 1.0413950456323338e-06, 'epoch': 0.9}\n", - "{'eval_loss': 0.38341203331947327, 'eval_accuracy': 0.85206316861946, 'eval_runtime': 18.552, 'eval_samples_per_second': 529.052, 'eval_steps_per_second': 8.301, 'epoch': 0.9}\n", - "{'loss': 0.4242, 'learning_rate': 2.2816166883963498e-07, 'epoch': 0.98}\n", - "{'eval_loss': 0.3831214904785156, 'eval_accuracy': 0.8536933265410087, 'eval_runtime': 18.5149, 'eval_samples_per_second': 530.113, 'eval_steps_per_second': 8.318, 'epoch': 0.98}\n", - "{'train_runtime': 2654.3429, 'train_samples_per_second': 147.947, 'train_steps_per_second': 4.623, 'train_loss': 0.4790087224918052, 'epoch': 1.0}\n", - "{'eval_loss': 0.3831214904785156, 'eval_accuracy': 0.8536933265410087, 'eval_runtime': 18.5645, 'eval_samples_per_second': 528.697, 'eval_steps_per_second': 8.295, 'epoch': 1.0}\n", - "{'eval_loss': 0.3831214904785156, 'eval_accuracy': 0.8536933265410087, 'eval_runtime': 18.5645, 'eval_samples_per_second': 528.697, 'eval_steps_per_second': 8.295, 'epoch': 1.0}\n" + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + 
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)", + "\u001B[0;32m/tmp/ipykernel_1083276/2185767024.py\u001B[0m in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[1;32m 12\u001B[0m )\n\u001B[1;32m 13\u001B[0m \u001B[0mtransformers\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mlogging\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mset_verbosity_error\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 14\u001B[0;31m \u001B[0mtrainer\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtrain\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 15\u001B[0m \u001B[0mprint\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mtrainer\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mevaluate\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 16\u001B[0m \u001B[0mmodel_bert\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0msave_pretrained\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m\"roberta-in-bert-trained\"\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", + "\u001B[0;32m~/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/transformers/trainer.py\u001B[0m in \u001B[0;36mtrain\u001B[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001B[0m\n\u001B[1;32m 1321\u001B[0m \u001B[0mtr_loss_step\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtraining_step\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmodel\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0minputs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1322\u001B[0m \u001B[0;32melse\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m-> 1323\u001B[0;31m \u001B[0mtr_loss_step\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtraining_step\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmodel\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0minputs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 1324\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1325\u001B[0m if (\n", + "\u001B[0;32m~/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/transformers/trainer.py\u001B[0m in \u001B[0;36mtraining_step\u001B[0;34m(self, model, inputs)\u001B[0m\n\u001B[1;32m 1875\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1876\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mdo_grad_scaling\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m-> 1877\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mscaler\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mscale\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mloss\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mbackward\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 1878\u001B[0m \u001B[0;32melif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0muse_apex\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1879\u001B[0m \u001B[0;32mwith\u001B[0m 
\u001B[0mamp\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mscale_loss\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mloss\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0moptimizer\u001B[0m\u001B[0;34m)\u001B[0m \u001B[0;32mas\u001B[0m \u001B[0mscaled_loss\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", + "\u001B[0;32m~/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/torch/_tensor.py\u001B[0m in \u001B[0;36mbackward\u001B[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001B[0m\n\u001B[1;32m 305\u001B[0m \u001B[0mcreate_graph\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0mcreate_graph\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 306\u001B[0m inputs=inputs)\n\u001B[0;32m--> 307\u001B[0;31m \u001B[0mtorch\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mautograd\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mbackward\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mgradient\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mretain_graph\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mcreate_graph\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0minputs\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0minputs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 308\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 309\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0mregister_hook\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mhook\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", + "\u001B[0;32m~/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/torch/autograd/__init__.py\u001B[0m in \u001B[0;36mbackward\u001B[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001B[0m\n\u001B[1;32m 152\u001B[0m \u001B[0mretain_graph\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mcreate_graph\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 153\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 154\u001B[0;31m Variable._execution_engine.run_backward(\n\u001B[0m\u001B[1;32m 155\u001B[0m \u001B[0mtensors\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mgrad_tensors_\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mretain_graph\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mcreate_graph\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0minputs\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 156\u001B[0m allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag\n", + "\u001B[0;31mKeyboardInterrupt\u001B[0m: " ] } ], @@ -867,7 +878,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -894,742 +905,14 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": { "scrolled": true }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8ed1b47f25084ffb98165b5a5ba60d22", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "0it [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "27b6e88f50be4e229bdcd4e992a83467", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/4 [00:00> Using amp half precision backend\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": 
[ - "{'eval_loss': 0.4492516815662384, 'eval_accuracy': 0.8271013754457464, 'eval_runtime': 46.2281, 'eval_samples_per_second': 212.317, 'eval_steps_per_second': 3.331}\n", - "{'eval_loss': 0.4492516815662384, 'eval_accuracy': 0.8271013754457464, 'eval_runtime': 46.2281, 'eval_samples_per_second': 212.317, 'eval_steps_per_second': 3.331}\n", - "{'loss': 0.4752, 'learning_rate': 9.188396349413299e-07, 'epoch': 0.08}\n", - "{'eval_loss': 0.4362102150917053, 'eval_accuracy': 0.8346408558329088, 'eval_runtime': 46.4717, 'eval_samples_per_second': 211.204, 'eval_steps_per_second': 3.314, 'epoch': 0.08}\n", - "{'loss': 0.4643, 'learning_rate': 8.373533246414604e-07, 'epoch': 0.16}\n", - "{'eval_loss': 0.42539361119270325, 'eval_accuracy': 0.8370860927152318, 'eval_runtime': 46.5627, 'eval_samples_per_second': 210.791, 'eval_steps_per_second': 3.307, 'epoch': 0.16}\n", - "{'loss': 0.4509, 'learning_rate': 7.558670143415907e-07, 'epoch': 0.24}\n", - "{'eval_loss': 0.42584264278411865, 'eval_accuracy': 0.8367804381049414, 'eval_runtime': 46.5106, 'eval_samples_per_second': 211.027, 'eval_steps_per_second': 3.311, 'epoch': 0.24}\n", - "{'loss': 0.4454, 'learning_rate': 6.743807040417211e-07, 'epoch': 0.33}\n", - "{'eval_loss': 0.427680641412735, 'eval_accuracy': 0.8410596026490066, 'eval_runtime': 46.5186, 'eval_samples_per_second': 210.991, 'eval_steps_per_second': 3.311, 'epoch': 0.33}\n", - "{'loss': 0.4486, 'learning_rate': 5.928943937418514e-07, 'epoch': 0.41}\n", - "{'eval_loss': 0.419879287481308, 'eval_accuracy': 0.8401426388181356, 'eval_runtime': 46.4807, 'eval_samples_per_second': 211.163, 'eval_steps_per_second': 3.313, 'epoch': 0.41}\n", - "{'loss': 0.444, 'learning_rate': 5.114895697522818e-07, 'epoch': 0.49}\n", - "{'eval_loss': 0.42938971519470215, 'eval_accuracy': 0.8374936321956189, 'eval_runtime': 46.467, 'eval_samples_per_second': 211.225, 'eval_steps_per_second': 3.314, 'epoch': 0.49}\n", - "{'loss': 0.442, 'learning_rate': 4.30003259452412e-07, 'epoch': 0.57}\n", - "{'eval_loss': 0.4225366413593292, 'eval_accuracy': 0.8381049414161997, 'eval_runtime': 46.5078, 'eval_samples_per_second': 211.04, 'eval_steps_per_second': 3.311, 'epoch': 0.57}\n", - "{'loss': 0.4463, 'learning_rate': 3.485169491525424e-07, 'epoch': 0.65}\n", - "{'eval_loss': 0.423688679933548, 'eval_accuracy': 0.8393275598573612, 'eval_runtime': 46.4966, 'eval_samples_per_second': 211.09, 'eval_steps_per_second': 3.312, 'epoch': 0.65}\n", - "{'loss': 0.4488, 'learning_rate': 2.671121251629727e-07, 'epoch': 0.73}\n", - "{'eval_loss': 0.4213014543056488, 'eval_accuracy': 0.8401426388181356, 'eval_runtime': 46.5212, 'eval_samples_per_second': 210.979, 'eval_steps_per_second': 3.31, 'epoch': 0.73}\n", - "{'loss': 0.4354, 'learning_rate': 1.8562581486310303e-07, 'epoch': 0.81}\n", - "{'eval_loss': 0.4192813038825989, 'eval_accuracy': 0.8407539480387163, 'eval_runtime': 48.5842, 'eval_samples_per_second': 202.02, 'eval_steps_per_second': 3.17, 'epoch': 0.81}\n", - "{'loss': 0.4344, 'learning_rate': 1.0422099087353327e-07, 'epoch': 0.9}\n", - "{'eval_loss': 0.41954925656318665, 'eval_accuracy': 0.8381049414161997, 'eval_runtime': 48.5554, 'eval_samples_per_second': 202.14, 'eval_steps_per_second': 3.172, 'epoch': 0.9}\n", - "{'loss': 0.436, 'learning_rate': 2.2734680573663627e-08, 'epoch': 0.98}\n", - "{'eval_loss': 0.41829705238342285, 'eval_accuracy': 0.8401426388181356, 'eval_runtime': 46.6717, 'eval_samples_per_second': 210.299, 'eval_steps_per_second': 3.3, 'epoch': 0.98}\n", - "{'train_runtime': 4966.1274, 
'train_samples_per_second': 79.076, 'train_steps_per_second': 2.471, 'train_loss': 0.4474433752206656, 'epoch': 1.0}\n", - "{'eval_loss': 0.427680641412735, 'eval_accuracy': 0.8410596026490066, 'eval_runtime': 46.5232, 'eval_samples_per_second': 210.97, 'eval_steps_per_second': 3.31, 'epoch': 1.0}\n", - "{'eval_loss': 0.427680641412735, 'eval_accuracy': 0.8410596026490066, 'eval_runtime': 46.5232, 'eval_samples_per_second': 210.97, 'eval_steps_per_second': 3.31, 'epoch': 1.0}\n" - ] - } - ], + "outputs": [], "source": [ "model_q = QDQBertForSequenceClassification.from_pretrained(\"roberta-in-bert-trained-quantized\", num_labels=num_labels)\n", "model_q = model_q.cuda()\n", "\n", - "args.learning_rate /= 10\n", + "args.learning_rate = 1e-6\n", "trainer = Trainer(\n", " model_q,\n", " args,\n", " train_dataset=encoded_dataset[\"train\"],\n", " eval_dataset=encoded_dataset[validation_key],\n", " tokenizer=tokenizer,\n", - " compute_metrics=compute_metrics\n", + " compute_metrics=compute_metrics,\n", ")\n", "transformers.logging.set_verbosity_error()\n", "print(trainer.evaluate())\n", @@ -1729,31 +969,23 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/geantvert/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/pytorch_quantization/nn/modules/tensor_quantizer.py:285: TracerWarning: Converting a tensor to a Python number might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", - " inputs, amax.item() / bound, 0,\n", - "/home/geantvert/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/pytorch_quantization/nn/modules/tensor_quantizer.py:291: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs!\n", - " quant_dim = list(amax.shape).index(list(amax_sequeeze.shape)[0])\n" - ] - } - ], + "outputs": [], "source": [ - "data = encoded_dataset[\"train\"][0: 3]\n", - "input_torch = {k: torch.tensor(v, dtype=torch.long, device=\"cuda\") for k, v in data.items() if k in [\"input_ids\", \"attention_mask\", \"token_type_ids\"]}\n", + "data = encoded_dataset[\"train\"][0:3]\n", + "input_torch = convert_tensor(data, output=\"torch\")\n", "\n", - "from pytorch_quantization.nn import TensorQuantizer\n", - "model_q = QDQBertForSequenceClassification.from_pretrained(\"roberta-in-bert-trained-quantized-bis\", num_labels=num_labels)\n", + "model_q = QDQBertForSequenceClassification.from_pretrained(\n", + " \"roberta-in-bert-trained-quantized-bis\", num_labels=num_labels\n", + ")\n", "model_q = model_q.cuda()\n", + "from pytorch_quantization.nn import TensorQuantizer\n", + "\n", "TensorQuantizer.use_fb_fake_quant = True\n", "convert_to_onnx(model_q, output_path=\"model_q.onnx\", inputs_pytorch=input_torch)\n", "TensorQuantizer.use_fb_fake_quant = False\n", - "del model_q" + "# del model_q" ] }, { @@ -1769,7 +1001,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 14, "metadata": { "pycharm": { "name": "#%%\n" @@ -1790,6 +1022,21 @@ ")" ] }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "# same thing from command line\n", + "# !/usr/src/tensorrt/bin/trtexec --onnx=model_q.onnx --shapes=input_ids:32x256,attention_mask:32x256 --int8 --workspace=10000 --saveEngine=\"test.plan\"" + ] + }, { "cell_type": "markdown", "metadata": { @@ -1803,7 +1050,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 16, "metadata": { "pycharm": { "name": "#%%\n" @@ -1811,18 +1058,27 @@ }, "outputs": [], "source": [ - "profile_index = 0\n", - "np_input = {\"input_ids\": np.random.randint(1, 10000, size=(batch_size, max_seq_len), dtype=np.int64),\n", - " \"attention_mask\": np.ones(shape=(batch_size, max_seq_len), dtype=np.int64),\n", - " }\n", - "\n", "stream: Stream = pycuda.driver.Stream()\n", - "\n", "context: IExecutionContext = engine.create_execution_context()\n", "context.set_optimization_profile_async(profile_index=profile_index, stream_handle=stream.handle)\n", "input_binding_idxs, output_binding_idxs = get_binding_idxs(engine, profile_index) # type: List[int], List[int]" ] }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "data = encoded_dataset[\"train\"][0:batch_size]\n", + "input_torch: Dict[str, torch.Tensor] = convert_tensor(data, output=\"torch\")\n", + "input_np: Dict[str, np.ndarray] = convert_tensor(data, output=\"np\")" + ] + }, { "cell_type": "markdown", "metadata": { @@ -1834,9 +1090,16 @@ "#### Inference on `TensorRT`" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We first check that inference is working correctly:" + ] + }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 18, "metadata": { "pycharm": { "name": "#%%\n" @@ -1847,45 +1110,45 @@ "name": "stdout", "output_type": "stream", "text": [ - "[array([[ 0.1358001 , -1.4377486 , 1.3672757 ],\n", - " [-0.16206698, -1.149481 , 1.4266016 ],\n", - " [ 0.0163878 , -1.0470941 , 1.2498031 ],\n", - " [-0.21079333, -0.91275144, 1.2614312 ],\n", - " [ 0.13416213, -1.2132894 , 1.0915226 ],\n", - 
" [-0.23387383, -0.6663823 , 1.0708152 ],\n", - " [-0.4426742 , -0.64095986, 0.6767337 ],\n", - " [-0.39520252, -0.6310587 , 1.162437 ],\n", - " [-0.11956491, -0.9094458 , 1.2330313 ],\n", - " [-0.34652767, -0.56745625, 1.1321819 ],\n", - " [-0.3788384 , -0.9477967 , 1.3850961 ],\n", - " [-1.079162 , 0.04613969, 0.9176692 ],\n", - " [-0.12555303, -0.8791798 , 1.2635291 ],\n", - " [-0.12463601, -0.63906515, 0.95351076],\n", - " [ 0.31858096, -0.410717 , 0.69519377],\n", - " [ 0.07587517, -0.58817637, 0.82071406],\n", - " [ 0.1137608 , -0.8322618 , 0.6675602 ],\n", - " [-0.50839895, -0.8443974 , 1.462322 ],\n", - " [-0.14658742, -1.1222454 , 1.3913041 ],\n", - " [ 0.05990895, -1.4671483 , 1.5297441 ],\n", - " [ 0.17553274, -0.26642302, 0.67778957],\n", - " [ 0.14809372, -1.3270702 , 1.1495501 ],\n", - " [-0.1042301 , -0.8665275 , 0.90043837],\n", - " [-0.78590935, -0.6129427 , 0.9732029 ],\n", - " [-0.19332369, -0.8912125 , 1.1381842 ],\n", - " [ 0.50638545, -0.9965472 , 0.69867384],\n", - " [-0.0973227 , -0.8511242 , 1.2328701 ],\n", - " [ 0.16307044, -1.1843398 , 1.437165 ],\n", - " [-0.6260487 , -0.5227167 , 1.247594 ],\n", - " [-0.30106562, -0.6723875 , 1.1667051 ],\n", - " [ 0.01060311, -1.1707903 , 1.3197892 ],\n", - " [-0.22743034, -0.99327207, 0.9541633 ]], dtype=float32)]\n" + "[array([[ 5.6351620e-01, 1.4767665e+00, -2.0787194e+00],\n", + " [ 2.3301950e+00, -1.0177574e+00, -1.3668290e+00],\n", + " [ 1.5580183e+00, -5.9583592e-01, -1.1548299e+00],\n", + " [ 1.9603873e+00, -2.0616996e-01, -1.8071964e+00],\n", + " [ 2.4737215e+00, -2.8350648e-01, -1.8850105e+00],\n", + " [ 3.6134090e+00, -1.3006018e-01, -2.7839746e+00],\n", + " [-3.1495490e+00, 2.4353392e+00, 1.9919875e-01],\n", + " [ 3.3117905e+00, -7.3505348e-01, -2.0925450e+00],\n", + " [ 3.2750502e-01, -1.5198725e+00, 1.2251633e+00],\n", + " [-3.7192254e+00, -5.1082242e-01, 3.6361742e+00],\n", + " [ 3.1723669e+00, -6.5267378e-01, -2.1629393e+00],\n", + " [-1.5052840e+00, -1.1153723e+00, 2.1314652e+00],\n", + " [-2.7875674e+00, 3.3702278e+00, -9.6062738e-01],\n", + " [-2.1700766e+00, 2.1553783e+00, -4.1763881e-01],\n", + " [-1.2523253e-01, -9.4394463e-01, 8.0471390e-01],\n", + " [ 2.3903012e+00, -1.0954552e+00, -1.0219078e+00],\n", + " [ 3.7135108e+00, -6.1678243e-01, -2.5324042e+00],\n", + " [-2.8983197e+00, -1.9243273e+00, 4.2502666e+00],\n", + " [-3.1470397e+00, -1.6737628e+00, 4.2269526e+00],\n", + " [-3.1141593e+00, 3.4683597e+00, -7.6941836e-01],\n", + " [ 3.8057449e+00, -6.6588068e-01, -2.4926093e+00],\n", + " [ 2.6230648e+00, 2.3657779e-01, -2.3784602e+00],\n", + " [-2.1757143e+00, 3.6484423e-01, 1.2388697e+00],\n", + " [ 3.7942352e+00, -4.8870793e-01, -2.6957376e+00],\n", + " [ 3.6744323e+00, -1.3134056e+00, -1.7758287e+00],\n", + " [-1.1789101e+00, 1.9029677e-01, 4.4781533e-01],\n", + " [-2.7335472e+00, 1.4046015e+00, 8.6339402e-01],\n", + " [-1.3156077e+00, 1.9026613e+00, -7.8255135e-01],\n", + " [-3.3838544e+00, -6.8161070e-01, 3.4489069e+00],\n", + " [ 2.7053127e+00, -4.8565903e-01, -2.0700452e+00],\n", + " [ 2.7803206e+00, -4.0302199e-01, -2.2101507e+00],\n", + " [ 3.1589518e+00, -1.0739815e-03, -2.7553422e+00]], dtype=float32)]\n" ] } ], "source": [ "tensorrt_output = infer_tensorrt(\n", " context=context,\n", - " host_inputs=np_input,\n", + " host_inputs=input_np,\n", " input_binding_idxs=input_binding_idxs,\n", " output_binding_idxs=output_binding_idxs,\n", " stream=stream,\n", @@ -1895,22 +1158,46 @@ }, { "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, + "metadata": {}, 
"source": [ - "#### Conversion with `trtexec` (command line approach)" + "We warmup the GPU with few inferences and then start the measures:" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[TensorRT (FP16)] mean=15.83ms, sd=1.68ms, min=14.22ms, max=25.70ms, median=15.09ms, 95p=17.72ms, 99p=19.29ms\n" + ] + } + ], "source": [ - "#!/usr/src/tensorrt/bin/trtexec --onnx=model_q.onnx --shapes=input_ids:32x256,attention_mask:32x256 --int8 --workspace=6000 --saveEngine=\"test.plan\"" + "for _ in range(30):\n", + " _ = infer_tensorrt(\n", + " context=context,\n", + " host_inputs=input_np,\n", + " input_binding_idxs=input_binding_idxs,\n", + " output_binding_idxs=output_binding_idxs,\n", + " stream=stream,\n", + " )\n", + "time_buffer = list()\n", + "for _ in range(100):\n", + " with track_infer_time(time_buffer):\n", + " _ = infer_tensorrt(\n", + " context=context,\n", + " host_inputs=input_np,\n", + " input_binding_idxs=input_binding_idxs,\n", + " output_binding_idxs=output_binding_idxs,\n", + " stream=stream,\n", + " )\n", + "\n", + "print_timings(name=\"TensorRT (INT-8)\", timings=time_buffer)\n", + "del engine, context # delete all tensorrt objects" ] }, { @@ -1923,97 +1210,57 @@ "source": [ "## Method 2: use a dedicated QDQ model\n", "\n", - "In method 2, the idea is to take the source code of a specific model and add manually in the source code `QDQ` nodes. That way, quantization will work out of the box. Even if `Bert` has many variations, it seems that very few of them are really used. Hugging Face transformers library include `Bert` model.\n", - "Our library offer a dedicated implementation of `Roberta`.\n", + "In method 2, the idea is to take the source code of a specific model and add manually in the source code `QDQ` nodes. 
That way, quantization will work out of the box for this architecture.\n", + "We have started with `QDQRoberta` a quantization compliant `Roberta` model.\n", "\n", - "To adapt another architecture, you need to:\n", + "To adapt to another architecture, one need to:\n", "\n", - "* replaced linear layers with their quantized version\n", - "* replace operations not supported out of the box by TensorRT by a similar code supporting the operation.\n", + "* replace linear layers with their quantized version\n", + "* replace operations not supported out of the box by `TensorRT` by a similar code supporting the operation.\n", "\n", - "> it's not a complex process, but it requires some knowledge of `ONNX` supported operations and `TensorRT` framework\n", + "> it's not a complex process, but it requires some knowledge of both `ONNX` supported operations and `TensorRT` framework\n", "\n", "The process below is a bit simpler than the method 1:\n", "\n", - "* finetune the QDQ model on the task (Quantization Aware Training)\n", + "\n", "* calibrate\n", "* Quantization Aware training (QAT)\n", "\n", - "> you may skip step 1/ if you want\n", "\n", "### Fine tuning the model" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": { "pycharm": { "name": "#%%\n" } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[INFO|trainer.py:437] 2021-12-06 20:38:02,464 >> Using amp half precision backend\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'loss': 0.6886, 'learning_rate': 9.188396349413299e-06, 'epoch': 0.08}\n", - "{'eval_loss': 0.4678966999053955, 'eval_accuracy': 0.8171166581762608, 'eval_runtime': 18.7354, 'eval_samples_per_second': 523.874, 'eval_steps_per_second': 8.22, 'epoch': 0.08}\n", - "{'loss': 0.5021, 'learning_rate': 8.373533246414604e-06, 'epoch': 0.16}\n", - "{'eval_loss': 0.4271945059299469, 'eval_accuracy': 0.8333163525216505, 'eval_runtime': 18.5466, 'eval_samples_per_second': 529.209, 'eval_steps_per_second': 8.303, 'epoch': 0.16}\n", - "{'loss': 0.4682, 'learning_rate': 7.558670143415907e-06, 'epoch': 0.24}\n", - "{'eval_loss': 0.4240091145038605, 'eval_accuracy': 0.8358634742740703, 'eval_runtime': 18.6916, 'eval_samples_per_second': 525.101, 'eval_steps_per_second': 8.239, 'epoch': 0.24}\n", - "{'loss': 0.4491, 'learning_rate': 6.743807040417211e-06, 'epoch': 0.33}\n", - "{'eval_loss': 0.38295766711235046, 'eval_accuracy': 0.8523688232297504, 'eval_runtime': 18.6766, 'eval_samples_per_second': 525.524, 'eval_steps_per_second': 8.246, 'epoch': 0.33}\n", - "{'loss': 0.4292, 'learning_rate': 5.9289439374185145e-06, 'epoch': 0.41}\n", - "{'eval_loss': 0.3819591999053955, 'eval_accuracy': 0.8519612837493632, 'eval_runtime': 19.1793, 'eval_samples_per_second': 511.75, 'eval_steps_per_second': 8.029, 'epoch': 0.41}\n", - "{'loss': 0.4188, 'learning_rate': 5.114080834419818e-06, 'epoch': 0.49}\n", - "{'eval_loss': 0.3905084729194641, 'eval_accuracy': 0.8507386653082017, 'eval_runtime': 18.5694, 'eval_samples_per_second': 528.559, 'eval_steps_per_second': 8.293, 'epoch': 0.49}\n", - "{'loss': 0.4171, 'learning_rate': 4.30003259452412e-06, 'epoch': 0.57}\n", - "{'eval_loss': 0.36459046602249146, 'eval_accuracy': 0.8601120733571065, 'eval_runtime': 18.5686, 'eval_samples_per_second': 528.579, 'eval_steps_per_second': 8.294, 'epoch': 0.57}\n", - "{'loss': 0.4118, 'learning_rate': 3.4851694915254244e-06, 'epoch': 0.65}\n", - "{'eval_loss': 0.35626235604286194, 'eval_accuracy': 0.8616403464085584, 
'eval_runtime': 18.5178, 'eval_samples_per_second': 530.029, 'eval_steps_per_second': 8.316, 'epoch': 0.65}\n", - "{'loss': 0.4006, 'learning_rate': 2.670306388526728e-06, 'epoch': 0.73}\n", - "{'eval_loss': 0.3605223596096039, 'eval_accuracy': 0.8653082017320428, 'eval_runtime': 18.6003, 'eval_samples_per_second': 527.68, 'eval_steps_per_second': 8.279, 'epoch': 0.73}\n", - "{'loss': 0.3936, 'learning_rate': 1.8570730117340288e-06, 'epoch': 0.81}\n", - "{'eval_loss': 0.3559686243534088, 'eval_accuracy': 0.8653082017320428, 'eval_runtime': 18.5309, 'eval_samples_per_second': 529.656, 'eval_steps_per_second': 8.31, 'epoch': 0.81}\n", - "{'loss': 0.3945, 'learning_rate': 1.0422099087353325e-06, 'epoch': 0.9}\n", - "{'eval_loss': 0.3518819212913513, 'eval_accuracy': 0.8659195109526235, 'eval_runtime': 18.5189, 'eval_samples_per_second': 529.998, 'eval_steps_per_second': 8.316, 'epoch': 0.9}\n", - "{'loss': 0.3977, 'learning_rate': 2.2734680573663624e-07, 'epoch': 0.98}\n", - "{'eval_loss': 0.34959253668785095, 'eval_accuracy': 0.8677534386143657, 'eval_runtime': 18.5328, 'eval_samples_per_second': 529.602, 'eval_steps_per_second': 8.31, 'epoch': 0.98}\n", - "{'train_runtime': 2665.1824, 'train_samples_per_second': 147.345, 'train_steps_per_second': 4.605, 'train_loss': 0.44651927413343606, 'epoch': 1.0}\n", - "{'eval_loss': 0.34959253668785095, 'eval_accuracy': 0.8677534386143657, 'eval_runtime': 18.4913, 'eval_samples_per_second': 530.789, 'eval_steps_per_second': 8.328, 'epoch': 1.0}\n", - "{'eval_loss': 0.34959253668785095, 'eval_accuracy': 0.8677534386143657, 'eval_runtime': 18.4913, 'eval_samples_per_second': 530.789, 'eval_steps_per_second': 8.328, 'epoch': 1.0}\n" - ] - } - ], + "outputs": [], "source": [ - "model_roberta: PreTrainedModel = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)\n", - "model_roberta = model_roberta.cuda()\n", + "# model_roberta: PreTrainedModel = AutoModelForSequenceClassification.from_pretrained(\n", + "# model_checkpoint, num_labels=num_labels\n", + "# )\n", + "# model_roberta = model_roberta.cuda()\n", "\n", - "args.learning_rate = 1e-5\n", - "trainer = Trainer(\n", - " model_roberta,\n", - " args,\n", - " train_dataset=encoded_dataset[\"train\"],\n", - " eval_dataset=encoded_dataset[validation_key],\n", - " tokenizer=tokenizer,\n", - " compute_metrics=compute_metrics\n", - ")\n", - "transformers.logging.set_verbosity_error()\n", - "trainer.train()\n", - "print(trainer.evaluate())\n", - "# {'eval_loss': 0.3559744358062744, 'eval_accuracy': 0.8655119714722364, 'eval_runtime': 19.6678, 'eval_samples_per_second': 499.04, 'eval_steps_per_second': 7.83, 'epoch': 0.98}\n", - "trainer.save_model(\"roberta-model\")\n", - "del model_roberta\n", - "del trainer" + "# args.learning_rate = 1e-5\n", + "# trainer = Trainer(\n", + "# model_roberta,\n", + "# args,\n", + "# train_dataset=encoded_dataset[\"train\"],\n", + "# eval_dataset=encoded_dataset[validation_key],\n", + "# tokenizer=tokenizer,\n", + "# compute_metrics=compute_metrics,\n", + "# )\n", + "# transformers.logging.set_verbosity_error()\n", + "# trainer.train()\n", + "# print(trainer.evaluate())\n", + "# # {'eval_loss': 0.3559744358062744, 'eval_accuracy': 0.8655119714722364, 'eval_runtime': 19.6678, 'eval_samples_per_second': 499.04, 'eval_steps_per_second': 7.83, 'epoch': 0.98}\n", + "# trainer.save_model(\"roberta-model\")\n", + "# del model_roberta\n", + "# del trainer" ] }, { @@ -2029,7 +1276,7 @@ }, { "cell_type": "code", - "execution_count": 30, + 
"execution_count": 51, "metadata": { "pycharm": { "name": "#%%\n" @@ -2039,7 +1286,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e696905fbfdf4a149cb2437482b20cf1", + "model_id": "2614c39f73cb4a219791b89922f11c36", "version_major": 2, "version_minor": 0 }, @@ -2053,7 +1300,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e1369f4ac82f4f90b2194d209dc1c8cd", + "model_id": "b43fc12b3f1d443a8a4079e8c7900ace", "version_major": 2, "version_minor": 0 }, @@ -2066,7 +1313,6 @@ } ], "source": [ - "\n", "input_desc = QuantDescriptor(num_bits=8, calib_method=\"histogram\")\n", "# below we do per-channel quantization for weights, set axis to None to get a per tensor calibration\n", "weight_desc = QuantDescriptor(num_bits=8, axis=(0,))\n", @@ -2074,12 +1320,16 @@ "quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc)\n", "\n", "# keep it on CPU\n", - "model_roberta_q: PreTrainedModel = QDQRobertaForSequenceClassification.from_pretrained(\"roberta-model\")\n", - "calibrate(model=model_roberta_q, encoded_dataset=encoded_dataset)\n", - "\n", + "model_roberta: PreTrainedModel = AutoModelForSequenceClassification.from_pretrained(\n", + " model_checkpoint, num_labels=num_labels\n", + ")\n", + "model_roberta.save_pretrained(\"roberta-untrained-quantized\")\n", + "del model_roberta\n", + "model_roberta_q: PreTrainedModel = QDQRobertaForSequenceClassification.from_pretrained(\"roberta-untrained-quantized\")\n", + "model_roberta_q = calibrate(model=model_roberta_q, encoded_dataset=encoded_dataset)\n", "\n", - "model_roberta_q.save_pretrained(\"roberta-trained-quantized\")\n", - "del model_roberta_q\n" + "model_roberta_q.save_pretrained(\"roberta-untrained-quantized\")\n", + "del model_roberta_q" ] }, { @@ -2095,7 +1345,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 52, "metadata": { "pycharm": { "name": "#%%\n" @@ -2106,71 +1356,55 @@ "name": "stderr", "output_type": "stream", "text": [ - "[INFO|trainer.py:437] 2021-12-06 21:28:16,421 >> Using amp half precision backend\n" + "[INFO|trainer.py:437] 2021-12-07 22:51:46,364 >> Using amp half precision backend\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "LR: 1.0000000000000002e-06\n", - "{'eval_loss': 0.38657698035240173, 'eval_accuracy': 0.8526744778400408, 'eval_runtime': 47.6064, 'eval_samples_per_second': 206.17, 'eval_steps_per_second': 3.235}\n", - "{'eval_loss': 0.38657698035240173, 'eval_accuracy': 0.8526744778400408, 'eval_runtime': 47.6064, 'eval_samples_per_second': 206.17, 'eval_steps_per_second': 3.235}\n", - "{'loss': 0.4018, 'learning_rate': 9.187581486310301e-07, 'epoch': 0.08}\n", - "{'eval_loss': 0.38418063521385193, 'eval_accuracy': 0.8558329088130413, 'eval_runtime': 46.6509, 'eval_samples_per_second': 210.393, 'eval_steps_per_second': 3.301, 'epoch': 0.08}\n", - "{'loss': 0.3954, 'learning_rate': 8.373533246414604e-07, 'epoch': 0.16}\n", - "{'eval_loss': 0.3795166015625, 'eval_accuracy': 0.8589913397860418, 'eval_runtime': 46.5562, 'eval_samples_per_second': 210.821, 'eval_steps_per_second': 3.308, 'epoch': 0.16}\n", - "{'loss': 0.3916, 'learning_rate': 7.558670143415907e-07, 'epoch': 0.24}\n", - "{'eval_loss': 0.3784726560115814, 'eval_accuracy': 0.8558329088130413, 'eval_runtime': 46.5355, 'eval_samples_per_second': 210.914, 'eval_steps_per_second': 3.309, 'epoch': 0.24}\n", - "{'loss': 0.3909, 'learning_rate': 6.743807040417211e-07, 'epoch': 0.33}\n", - "{'eval_loss': 0.38643816113471985, 'eval_accuracy': 0.8565461029037188, 
'eval_runtime': 46.544, 'eval_samples_per_second': 210.876, 'eval_steps_per_second': 3.309, 'epoch': 0.33}\n", - "{'loss': 0.3932, 'learning_rate': 5.928943937418514e-07, 'epoch': 0.41}\n", - "{'eval_loss': 0.3807451128959656, 'eval_accuracy': 0.8582781456953642, 'eval_runtime': 46.5617, 'eval_samples_per_second': 210.796, 'eval_steps_per_second': 3.307, 'epoch': 0.41}\n", - "{'loss': 0.3894, 'learning_rate': 5.114895697522818e-07, 'epoch': 0.49}\n", - "{'eval_loss': 0.3824027180671692, 'eval_accuracy': 0.8613346917982679, 'eval_runtime': 46.5541, 'eval_samples_per_second': 210.83, 'eval_steps_per_second': 3.308, 'epoch': 0.49}\n", - "{'loss': 0.3895, 'learning_rate': 4.3008474576271193e-07, 'epoch': 0.57}\n", - "{'eval_loss': 0.3791654407978058, 'eval_accuracy': 0.8613346917982679, 'eval_runtime': 46.5392, 'eval_samples_per_second': 210.897, 'eval_steps_per_second': 3.309, 'epoch': 0.57}\n", - "{'loss': 0.388, 'learning_rate': 3.4859843546284233e-07, 'epoch': 0.65}\n", - "{'eval_loss': 0.3764157295227051, 'eval_accuracy': 0.8595007641365258, 'eval_runtime': 47.0386, 'eval_samples_per_second': 208.659, 'eval_steps_per_second': 3.274, 'epoch': 0.65}\n", - "{'loss': 0.3928, 'learning_rate': 2.671121251629727e-07, 'epoch': 0.73}\n", - "{'eval_loss': 0.37711256742477417, 'eval_accuracy': 0.8613346917982679, 'eval_runtime': 48.7144, 'eval_samples_per_second': 201.48, 'eval_steps_per_second': 3.161, 'epoch': 0.73}\n", - "{'loss': 0.381, 'learning_rate': 1.857073011734029e-07, 'epoch': 0.81}\n", - "{'eval_loss': 0.38059118390083313, 'eval_accuracy': 0.8595007641365258, 'eval_runtime': 47.0072, 'eval_samples_per_second': 208.798, 'eval_steps_per_second': 3.276, 'epoch': 0.81}\n", - "{'loss': 0.3798, 'learning_rate': 1.0422099087353327e-07, 'epoch': 0.9}\n", - "{'eval_loss': 0.3735353648662567, 'eval_accuracy': 0.8599083036169128, 'eval_runtime': 48.5826, 'eval_samples_per_second': 202.027, 'eval_steps_per_second': 3.17, 'epoch': 0.9}\n", - "{'loss': 0.3823, 'learning_rate': 2.2734680573663627e-08, 'epoch': 0.98}\n", - "{'eval_loss': 0.3766668438911438, 'eval_accuracy': 0.8596026490066225, 'eval_runtime': 48.2033, 'eval_samples_per_second': 203.617, 'eval_steps_per_second': 3.195, 'epoch': 0.98}\n", - "{'train_runtime': 5010.7316, 'train_samples_per_second': 78.372, 'train_steps_per_second': 2.449, 'train_loss': 0.3895211076798619, 'epoch': 1.0}\n", - "{'eval_loss': 0.3824027180671692, 'eval_accuracy': 0.8613346917982679, 'eval_runtime': 47.2938, 'eval_samples_per_second': 207.532, 'eval_steps_per_second': 3.256, 'epoch': 1.0}\n", - "{'eval_loss': 0.3824027180671692, 'eval_accuracy': 0.8613346917982679, 'eval_runtime': 47.2938, 'eval_samples_per_second': 207.532, 'eval_steps_per_second': 3.256, 'epoch': 1.0}\n" + "{'loss': 0.7546, 'learning_rate': 9.1875814863103e-06, 'epoch': 0.08}\n", + "{'eval_loss': 0.5080597400665283, 'eval_accuracy': 0.8026490066225166, 'eval_runtime': 46.668, 'eval_samples_per_second': 210.315, 'eval_steps_per_second': 3.3, 'epoch': 0.08}\n", + "{'loss': 0.5466, 'learning_rate': 8.372718383311604e-06, 'epoch': 0.16}\n", + "{'eval_loss': 0.452316552400589, 'eval_accuracy': 0.8242485990830362, 'eval_runtime': 46.9082, 'eval_samples_per_second': 209.238, 'eval_steps_per_second': 3.283, 'epoch': 0.16}\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call 
last)", + "\u001B[0;32m/tmp/ipykernel_1192642/1887935502.py\u001B[0m in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[1;32m 14\u001B[0m )\n\u001B[1;32m 15\u001B[0m \u001B[0mtransformers\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mlogging\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mset_verbosity_error\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 16\u001B[0;31m \u001B[0mtrainer\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtrain\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 17\u001B[0m \u001B[0mprint\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mtrainer\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mevaluate\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 18\u001B[0m \u001B[0;31m# {'eval_loss': 0.40235549211502075, 'eval_accuracy': 0.8589913397860418, 'eval_runtime': 46.1754, 'eval_samples_per_second': 212.559, 'eval_steps_per_second': 3.335, 'epoch': 1.0}\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", + "\u001B[0;32m~/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/transformers/trainer.py\u001B[0m in \u001B[0;36mtrain\u001B[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001B[0m\n\u001B[1;32m 1321\u001B[0m \u001B[0mtr_loss_step\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtraining_step\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmodel\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0minputs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1322\u001B[0m \u001B[0;32melse\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m-> 1323\u001B[0;31m \u001B[0mtr_loss_step\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtraining_step\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmodel\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0minputs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 1324\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1325\u001B[0m if (\n", + "\u001B[0;32m~/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/transformers/trainer.py\u001B[0m in \u001B[0;36mtraining_step\u001B[0;34m(self, model, inputs)\u001B[0m\n\u001B[1;32m 1875\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1876\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mdo_grad_scaling\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m-> 1877\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mscaler\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mscale\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mloss\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mbackward\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 1878\u001B[0m \u001B[0;32melif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0muse_apex\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1879\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mamp\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mscale_loss\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mloss\u001B[0m\u001B[0;34m,\u001B[0m 
\u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0moptimizer\u001B[0m\u001B[0;34m)\u001B[0m \u001B[0;32mas\u001B[0m \u001B[0mscaled_loss\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", + "\u001B[0;32m~/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/torch/_tensor.py\u001B[0m in \u001B[0;36mbackward\u001B[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001B[0m\n\u001B[1;32m 305\u001B[0m \u001B[0mcreate_graph\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0mcreate_graph\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 306\u001B[0m inputs=inputs)\n\u001B[0;32m--> 307\u001B[0;31m \u001B[0mtorch\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mautograd\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mbackward\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mgradient\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mretain_graph\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mcreate_graph\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0minputs\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0minputs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 308\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 309\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0mregister_hook\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mhook\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", + "\u001B[0;32m~/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/torch/autograd/__init__.py\u001B[0m in \u001B[0;36mbackward\u001B[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001B[0m\n\u001B[1;32m 152\u001B[0m \u001B[0mretain_graph\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mcreate_graph\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 153\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 154\u001B[0;31m Variable._execution_engine.run_backward(\n\u001B[0m\u001B[1;32m 155\u001B[0m \u001B[0mtensors\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mgrad_tensors_\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mretain_graph\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mcreate_graph\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0minputs\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 156\u001B[0m allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag\n", + "\u001B[0;31mKeyboardInterrupt\u001B[0m: " ] } ], "source": [ - "\n", - "model_roberta_q: PreTrainedModel = QDQRobertaForSequenceClassification.from_pretrained(\"roberta-trained-quantized\", num_labels=num_labels)\n", + "model_roberta_q: PreTrainedModel = QDQRobertaForSequenceClassification.from_pretrained(\n", + " \"roberta-untrained-quantized\", num_labels=num_labels\n", + ")\n", "model_roberta_q = model_roberta_q.cuda()\n", "\n", - "args.learning_rate /= 10\n", - "print(f\"LR: {args.learning_rate}\")\n", + "args.learning_rate = 1e-5\n", "trainer = Trainer(\n", " model_roberta_q,\n", " args,\n", " train_dataset=encoded_dataset[\"train\"],\n", " eval_dataset=encoded_dataset[validation_key],\n", " tokenizer=tokenizer,\n", - " compute_metrics=compute_metrics\n", + " compute_metrics=compute_metrics,\n", ")\n", "transformers.logging.set_verbosity_error()\n", - "print(trainer.evaluate())\n", - "# 4 batches\n", - "# {'eval_loss': 0.38076257705688477, 'eval_accuracy': 0.8552215995924605, 'eval_runtime': 46.9577, 'eval_samples_per_second': 
209.018, 'eval_steps_per_second': 3.28}\n", - "# 100 batches\n", - "# {'eval_loss': 0.386756956577301, 'eval_accuracy': 0.8516556291390729, 'eval_runtime': 48.9996, 'eval_samples_per_second': 200.308, 'eval_steps_per_second': 3.143}\n", "trainer.train()\n", "print(trainer.evaluate())\n", "# {'eval_loss': 0.40235549211502075, 'eval_accuracy': 0.8589913397860418, 'eval_runtime': 46.1754, 'eval_samples_per_second': 212.559, 'eval_steps_per_second': 3.335, 'epoch': 1.0}\n", - "model_roberta_q.save_pretrained(\"roberta-in-bert-trained-quantized-retrained\")\n", + "model_roberta_q.save_pretrained(\"roberta-in-bert-trained-quantized\")\n", "del model_roberta_q" ] }, @@ -2178,16 +1412,23 @@ "cell_type": "markdown", "metadata": { "pycharm": { - "name": "#%%\n" + "name": "#%% md\n" } }, "source": [ "### Benchmark" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Export a `QDQ Pytorch` model on `ONNX`, we need to enable fake quantization mode from Pytorch." + ] + }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 20, "metadata": { "pycharm": { "name": "#%%\n" @@ -2206,13 +1447,16 @@ } ], "source": [ - "model_roberta_q: PreTrainedModel = QDQRobertaForSequenceClassification.from_pretrained(\"roberta-in-bert-trained-quantized-retrained\", num_labels=num_labels)\n", + "model_roberta_q: PreTrainedModel = QDQRobertaForSequenceClassification.from_pretrained(\n", + " \"roberta-in-bert-trained-quantized-retrained\", num_labels=num_labels\n", + ")\n", "model_roberta_q = model_roberta_q.cuda()\n", "\n", - "data = encoded_dataset[\"train\"][1: 3]\n", - "input_torch = {k: torch.tensor(list(v), dtype=torch.long, device=\"cuda\") for k, v in data.items() if k in [\"input_ids\", \"attention_mask\", \"token_type_ids\"]}\n", + "data = encoded_dataset[\"train\"][1:3]\n", + "input_torch = convert_tensor(data, output=\"torch\")\n", "\n", "from pytorch_quantization.nn import TensorQuantizer\n", + "\n", "TensorQuantizer.use_fb_fake_quant = True\n", "convert_to_onnx(model_pytorch=model_roberta_q, output_path=\"roberta_q.onnx\", inputs_pytorch=input_torch)\n", "TensorQuantizer.use_fb_fake_quant = False" @@ -2222,9 +1466,195 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Latency measures\n", + "#### Convert `ONNX` graph to `TensorRT` engine" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "engine = build_engine(\n", + " runtime=runtime,\n", + " onnx_file_path=\"roberta_q.onnx\",\n", + " logger=trt_logger,\n", + " min_shape=(batch_size, max_seq_len),\n", + " optimal_shape=(batch_size, max_seq_len),\n", + " max_shape=(batch_size, max_seq_len),\n", + " workspace_size=10000 * 1024 * 1024,\n", + " fp16=False,\n", + " int8=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "# same conversion from the terminal\n", + "#!/usr/src/tensorrt/bin/trtexec --onnx=roberta_q.onnx --shapes=input_ids:32x256,attention_mask:32x256 --int8 --workspace=10000 --saveEngine=\"test.plan\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Prepare input and output buffer" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "stream: Stream = pycuda.driver.Stream()\n", + "context: IExecutionContext = 
engine.create_execution_context()\n", + "context.set_optimization_profile_async(profile_index=profile_index, stream_handle=stream.handle)\n", + "input_binding_idxs, output_binding_idxs = get_binding_idxs(engine, profile_index) # type: List[int], List[int]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "data = encoded_dataset[\"train\"][0:batch_size]\n", + "input_torch: OD[str, torch.Tensor] = convert_tensor(data=data, output=\"torch\")\n", + "input_np: OD[str, np.ndarray] = convert_tensor(data=data, output=\"np\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Inference on `TensorRT`\n", + "\n", + "We first check that inference is working correctly:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[array([[ 0.18246882, 1.1604499 , -1.9064299 ],\n", + " [ 2.2959347 , -0.90284556, -1.7264905 ],\n", + " [ 2.0187902 , -0.773502 , -1.5992272 ],\n", + " [ 1.0055674 , -0.11009257, -1.5396124 ],\n", + " [ 1.859256 , -0.2640643 , -1.9893125 ],\n", + " [ 4.013083 , -1.0762665 , -2.6802182 ],\n", + " [-3.8274715 , 2.82228 , 0.53441864],\n", + " [ 3.9432845 , -1.111492 , -2.5691993 ],\n", + " [-0.02036566, -1.372117 , 0.7166275 ],\n", + " [-3.4707005 , -1.0002614 , 4.073874 ],\n", + " [ 3.2287133 , -1.2782975 , -2.2006783 ],\n", + " [-2.6840115 , -1.0188127 , 3.1551964 ],\n", + " [-3.7200396 , 2.349886 , 0.5967207 ],\n", + " [-2.813374 , 2.1468658 , -0.05996893],\n", + " [-0.65687865, -0.6351316 , 0.39051658],\n", + " [ 2.4374492 , -1.0998904 , -1.6697674 ],\n", + " [ 3.8417404 , -0.5648582 , -2.9105089 ],\n", + " [-2.7713814 , -1.9245013 , 4.5542636 ],\n", + " [-3.3167183 , -1.1679829 , 4.150669 ],\n", + " [-2.3994412 , 3.990843 , -1.8524642 ],\n", + " [ 3.919186 , -0.6086215 , -3.0692425 ],\n", + " [ 2.6670258 , 0.80300426, -3.5866559 ],\n", + " [-3.1351302 , -0.55978656, 3.1145272 ],\n", + " [ 4.003843 , -0.3124757 , -3.3316247 ],\n", + " [ 3.922188 , -1.144398 , -2.5969653 ],\n", + " [-2.0846744 , 0.20275442, 0.92436594],\n", + " [-2.3019078 , -0.13688484, 1.519916 ],\n", + " [-1.7067494 , 3.948385 , -2.4575708 ],\n", + " [-3.6868966 , 0.2229948 , 2.682487 ],\n", + " [ 3.1668653 , -0.71280336, -2.5332575 ],\n", + " [ 3.3461478 , -0.47501963, -2.9260926 ],\n", + " [ 3.4306564 , -0.8531854 , -2.635625 ]], dtype=float32)]\n" + ] + } + ], + "source": [ + "tensorrt_output = infer_tensorrt(\n", + " context=context,\n", + " host_inputs=input_np,\n", + " input_binding_idxs=input_binding_idxs,\n", + " output_binding_idxs=output_binding_idxs,\n", + " stream=stream,\n", + ")\n", + "print(tensorrt_output)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We warmup the GPU with few inferences and then start the measures:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[TensorRT (FP16)] mean=14.94ms, sd=0.61ms, min=14.64ms, max=17.95ms, median=14.77ms, 95p=16.19ms, 99p=17.74ms\n" + ] + } + ], + "source": [ + "for _ in range(30):\n", + " _ = infer_tensorrt(\n", + " context=context,\n", + " host_inputs=input_np,\n", + " input_binding_idxs=input_binding_idxs,\n", + " output_binding_idxs=output_binding_idxs,\n", + " stream=stream,\n", + " )\n", + "time_buffer = list()\n", + "for _ in 
range(100):\n", + " with track_infer_time(time_buffer):\n", + " _ = infer_tensorrt(\n", + " context=context,\n", + " host_inputs=input_np,\n", + " input_binding_idxs=input_binding_idxs,\n", + " output_binding_idxs=output_binding_idxs,\n", + " stream=stream,\n", + " )\n", "\n", - "Let's see if what we have done is useful...\n" + "print_timings(name=\"TensorRT (INT-8)\", timings=time_buffer)\n", + "del engine, context" ] }, { @@ -2238,21 +1668,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Below we export a randomly initialized Roberta model, the purpose is to only check the performance." + "Below we export a randomly initialized `Roberta` model, the purpose is to only check the performance on mixed precision (FP16, no quantization)." ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 30, "metadata": { "scrolled": true }, "outputs": [], "source": [ - "data = encoded_dataset[\"train\"][1:10]\n", - "input_torch = {k: torch.tensor(list(v), dtype=torch.long, device=\"cuda\")\n", - " for k, v in data.items() if k in [\"input_ids\", \"attention_mask\", \"token_type_ids\"]}\n", - "\n", "baseline_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)\n", "baseline_model = baseline_model.cuda()\n", "convert_to_onnx(baseline_model, output_path=\"baseline.onnx\", inputs_pytorch=input_torch)\n", @@ -2261,13 +1687,56 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 35, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[TensorRT (FP16)] mean=30.17ms, sd=0.46ms, min=29.53ms, max=32.12ms, median=29.97ms, 95p=31.24ms, 99p=31.58ms\n" + ] + } + ], "source": [ - "#!/usr/src/tensorrt/bin/trtexec --onnx=baseline.onnx --shapes=input_ids:1x384,attention_mask:1x384 --best --workspace=6000" + "engine = build_engine(\n", + " runtime=runtime,\n", + " onnx_file_path=\"baseline.onnx\",\n", + " logger=trt_logger,\n", + " min_shape=(batch_size, max_seq_len),\n", + " optimal_shape=(batch_size, max_seq_len),\n", + " max_shape=(batch_size, max_seq_len),\n", + " workspace_size=10000 * 1024 * 1024,\n", + " fp16=True,\n", + " int8=False,\n", + ")\n", + "stream: Stream = pycuda.driver.Stream()\n", + "context: IExecutionContext = engine.create_execution_context()\n", + "context.set_optimization_profile_async(profile_index=profile_index, stream_handle=stream.handle)\n", + "input_binding_idxs, output_binding_idxs = get_binding_idxs(engine, profile_index) # type: List[int], List[int]\n", + "for _ in range(30):\n", + " _ = infer_tensorrt(\n", + " context=context,\n", + " host_inputs=input_np,\n", + " input_binding_idxs=input_binding_idxs,\n", + " output_binding_idxs=output_binding_idxs,\n", + " stream=stream,\n", + " )\n", + "time_buffer = list()\n", + "for _ in range(100):\n", + " with track_infer_time(time_buffer):\n", + " _ = infer_tensorrt(\n", + " context=context,\n", + " host_inputs=input_np,\n", + " input_binding_idxs=input_binding_idxs,\n", + " output_binding_idxs=output_binding_idxs,\n", + " stream=stream,\n", + " )\n", + "\n", + "print_timings(name=\"TensorRT (FP16)\", timings=time_buffer)\n", + "del engine, context" ] }, { @@ -2277,6 +1746,72 @@ "## Pytorch baseline" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To finish, we will measure vanilla Pytorch inference on both FP32 and FP16 precision, it will be our baseline:" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": 
[ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Pytorch (FP32)] mean=76.31ms, sd=1.04ms, min=75.32ms, max=82.39ms, median=76.02ms, 95p=78.31ms, 99p=79.18ms\n" + ] + } + ], + "source": [ + "baseline_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)\n", + "baseline_model = baseline_model.cuda()\n", + "\n", + "data = encoded_dataset[\"train\"][0:batch_size]\n", + "input_torch: OD[str, torch.Tensor] = convert_tensor(data=data, output=\"torch\")\n", + "\n", + "for _ in range(30):\n", + " _ = baseline_model(**input_torch)\n", + " torch.cuda.synchronize()\n", + "time_buffer = list()\n", + "for _ in range(100):\n", + " with track_infer_time(time_buffer):\n", + " _ = baseline_model(**input_torch)\n", + " torch.cuda.synchronize()\n", + "print_timings(name=\"Pytorch (FP32)\", timings=time_buffer)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Pytorch (FP16)] mean=56.04ms, sd=0.56ms, min=55.44ms, max=57.93ms, median=55.88ms, 95p=57.36ms, 99p=57.62ms\n" + ] + } + ], + "source": [ + "from torch.cuda.amp import autocast\n", + "\n", + "with autocast():\n", + " for _ in range(30):\n", + " _ = baseline_model(**input_torch)\n", + " torch.cuda.synchronize()\n", + " time_buffer = []\n", + " for _ in range(100):\n", + " with track_infer_time(time_buffer):\n", + " _ = baseline_model(**input_torch)\n", + " torch.cuda.synchronize()\n", + "print_timings(name=\"Pytorch (FP16)\", timings=time_buffer)" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/requirements.txt b/requirements.txt index 6d955a18..cebff0a5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,6 @@ sympy coloredlogs pytest colored -black +black[jupyter] isort flake8 diff --git a/requirements_gpu.txt b/requirements_gpu.txt index 7a17eabe..e6f536b2 100644 --- a/requirements_gpu.txt +++ b/requirements_gpu.txt @@ -8,4 +8,3 @@ nvidia-tensorrt onnx_graphsurgeon polygraphy triton-model-analyzer -pytorch-quantization diff --git a/src/transformer_deploy/backends/trt_utils.py b/src/transformer_deploy/backends/trt_utils.py index 872b6b41..c6ed7058 100644 --- a/src/transformer_deploy/backends/trt_utils.py +++ b/src/transformer_deploy/backends/trt_utils.py @@ -77,20 +77,20 @@ def setup_binding_shapes( host_inputs: List[np.ndarray], input_binding_idxs: List[int], output_binding_idxs: List[int], -): +) -> Tuple[List[np.ndarray], List[DeviceAllocation]]: # explicitly set dynamic input shapes, so dynamic output shapes can be computed internally for host_input, binding_index in zip(host_inputs, input_binding_idxs): context.set_binding_shape(binding_index, host_input.shape) assert context.all_binding_shapes_specified - host_outputs = [] - device_outputs = [] + host_outputs: List[np.ndarray] = [] + device_outputs: List[DeviceAllocation] = [] for binding_index in output_binding_idxs: output_shape = context.get_binding_shape(binding_index) - # allocate buffers to hold output results after copying back to host - buffer = np.empty(output_shape, dtype=np.float32) - host_outputs.append(buffer) - # allocate output buffers on device - device_outputs.append(cuda.mem_alloc(buffer.nbytes)) + # allocate buffers to hold output results after copying back to host + buffer = np.empty(output_shape, dtype=np.float32) + host_outputs.append(buffer) + # allocate output buffers on device + device_outputs.append(cuda.mem_alloc(buffer.nbytes)) return host_outputs, device_outputs 
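The `setup_binding_shapes` refactor above and the `build_engine` / `infer_tensorrt` changes in the hunks below are meant to be used together. As a quick orientation for reviewers, here is a minimal usage sketch of how these helpers compose once the patch is applied. It is illustrative only and not part of the patch: the ONNX path, batch/sequence shapes and workspace size are assumptions, and it relies on the same `tensorrt` / `pycuda` dependencies already used by `trt_utils.py`.

```python
# Minimal sketch, assuming a QDQ ONNX file "model_q.onnx" and batch 32 x seq len 256.
import numpy as np
import pycuda.autoinit  # noqa: F401 - creates a CUDA context
import pycuda.driver as cuda
import tensorrt as trt

from transformer_deploy.backends.trt_utils import build_engine, get_binding_idxs, infer_tensorrt

trt_logger = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(trt_logger)

# build an INT-8 engine from a QDQ ONNX graph (fp16 / int8 flags introduced by this patch)
engine = build_engine(
    runtime=runtime,
    onnx_file_path="model_q.onnx",
    logger=trt_logger,
    min_shape=(32, 256),
    optimal_shape=(32, 256),
    max_shape=(32, 256),
    workspace_size=10000 * 1024 * 1024,
    fp16=False,
    int8=True,
)

stream = cuda.Stream()
context = engine.create_execution_context()
context.set_optimization_profile_async(profile_index=0, stream_handle=stream.handle)
input_idxs, output_idxs = get_binding_idxs(engine, 0)

# int32 host inputs: as noted in infer_tensorrt, int64 inputs may slightly change the output
dummy_inputs = {
    "input_ids": np.random.randint(1, 1000, size=(32, 256), dtype=np.int32),
    "attention_mask": np.ones((32, 256), dtype=np.int32),
}
logits = infer_tensorrt(
    context=context,
    host_inputs=dummy_inputs,
    input_binding_idxs=input_idxs,
    output_binding_idxs=output_idxs,
    stream=stream,
)
print(logits[0].shape)
```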
@@ -146,8 +146,6 @@ def build_engine( with trt.OnnxParser(network_definition, logger) as parser: # type: OnnxParser builder.max_batch_size = max_shape[0] # max batch size config: IBuilderConfig = builder.create_builder_config() - # config.min_timing_iterations = 1 - # config.avg_timing_iterations = 1 config.max_workspace_size = workspace_size # to enable complete trt inspector debugging, only for TensorRT >= 8.2 # config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED @@ -173,12 +171,8 @@ def build_engine( max=max_shape, ) config.add_optimization_profile(profile) - # for i in range(network.num_layers): - # layer: ILayer = network.get_layer(i) - # if "gemm" in str(layer.name).lower(): - # for g in range(layer.num_outputs): - # layer.precision = trt.DataType.FLOAT - network_definition = fix_fp16_network(network_definition) + if fp16: + network_definition = fix_fp16_network(network_definition) trt_engine = builder.build_serialized_network(network_definition, config) engine: ICudaEngine = runtime.deserialize_cuda_engine(trt_engine) assert engine is not None, "error during engine generation, check error messages above :-(" @@ -202,16 +196,20 @@ def infer_tensorrt( output_binding_idxs: List[int], stream: Stream, ) -> np.ndarray: - # warning: small change in output if int64 is used instead of int32 - input_list: List[ndarray] = [tensor.astype(np.int32) for tensor in host_inputs.values()] - # allocate GPU memory for input tensors - device_inputs = [cuda.mem_alloc(tensor.nbytes) for tensor in input_list] - for h_input, d_input in zip(input_list, device_inputs): - cuda.memcpy_htod_async(d_input, h_input) # host to GPU + input_list: List[ndarray] = list() + device_inputs: List[DeviceAllocation] = list() + for tensor in host_inputs.values(): + # warning: small change in output if int64 is used instead of int32 + tensor_int32: np.ndarray = np.asarray(tensor, dtype=np.int32) + input_list.append(tensor_int32) + # allocate GPU memory for input tensors + device_input: DeviceAllocation = cuda.mem_alloc(tensor_int32.nbytes) + device_inputs.append(device_input) + cuda.memcpy_htod_async(device_input, tensor_int32.ravel(), stream) # calculate input shape, bind it, allocate GPU memory for the output host_outputs, device_outputs = setup_binding_shapes(context, input_list, input_binding_idxs, output_binding_idxs) bindings = device_inputs + device_outputs - context.execute_async_v2(bindings, stream.handle) + assert context.execute_async_v2(bindings, stream_handle=stream.handle), "failure during execute inference call" for h_output, d_output in zip(host_outputs, device_outputs): cuda.memcpy_dtoh_async(h_output, d_output) # GPU to host stream.synchronize() # sync all CUDA ops diff --git a/src/transformer_deploy/convert.py b/src/transformer_deploy/convert.py index e50687d4..e65c4057 100644 --- a/src/transformer_deploy/convert.py +++ b/src/transformer_deploy/convert.py @@ -245,22 +245,22 @@ def main(): timings[benchmar_name] = time_buffer del model + conf = Configuration( + model_name=args.name, + model_type=ModelType.ONNX, + batch_size=0, + nb_output=output_pytorch.shape[1], + nb_instance=args.nb_instances, + include_token_type=include_token_ids, + workind_directory=args.output, + ) + conf.create_folders(tokenizer=tokenizer, model_path=onnx_optim_fp16_path) + print(f"Inference done on {get_device_name(0)}") print("latencies:") for name, time_buffer in timings.items(): print_timings(name=name, timings=time_buffer) - conf = Configuration( - model_name=args.name, - model_type=ModelType.ONNX, - batch_size=0, - 
nb_output=output_pytorch.shape[1], - nb_instance=args.nb_instances, - include_token_type=include_token_ids, - workind_directory=args.output, - ) - conf.create_folders(tokenizer=tokenizer, model_path=onnx_optim_fp16_path) - if __name__ == "__main__": main() From ac81a236da275985224bb8631bd4b018fe632acd Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Wed, 8 Dec 2021 14:11:53 +0100 Subject: [PATCH 07/15] update quantization notebook --- README.md | 2 +- demo/README.md | 2 +- demo/quantization_end_to_end.ipynb | 826 +++++++++++-------- src/transformer_deploy/backends/ort_utils.py | 8 +- src/transformer_deploy/convert.py | 9 +- 5 files changed, 512 insertions(+), 335 deletions(-) diff --git a/README.md b/README.md index e74842cd..c2dd4352 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,7 @@ With the single command below, you will: * **generate** configuration files for Triton inference server ```shell -convert_model -m roberta-large-mnli --backend tensorrt onnx pytorch --seq-len 16 128 128 --batch-size 1 32 32 +convert_model -m roberta-large-mnli --backend tensorrt onnx --seq-len 16 128 128 --batch-size 1 32 32 ``` > **16 128 128** -> minimum, optimal, maximum sequence length, to help TensorRT better optimize your model diff --git a/demo/README.md b/demo/README.md index 2a606e17..b97ae13d 100644 --- a/demo/README.md +++ b/demo/README.md @@ -40,7 +40,7 @@ docker run -it --rm --gpus all \ -v $PWD:/project ghcr.io/els-rd/transformer-deploy:0.1.1 \ bash -c "cd /project && \ convert_model -m \"philschmid/MiniLM-L6-H384-uncased-sst2\" \ - --backend tensorrt onnx pytorch \ + --backend tensorrt onnx \ --seq-len 16 128 128" ``` diff --git a/demo/quantization_end_to_end.ipynb b/demo/quantization_end_to_end.ipynb index 2c16c604..9793151d 100644 --- a/demo/quantization_end_to_end.ipynb +++ b/demo/quantization_end_to_end.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Recipes to perform Nvidia GPU INT-8 quantization on most transformers model" + "# Recipes to perform Nvidia GPU INT-8 quantization on most transformers model (encoder based)" ] }, { @@ -21,16 +21,21 @@ "\n", "The purpose of this tutorial is to show 2 processes to perform quantization on most `transformer` architecture.\n", "\n", - "**TL;DR, inference is 5 times faster on a `Roberta-base` model** with a batch of size 32 / seq len 384, benchmark (bold, quantization):\n", + "**TL;DR, inference is 5 times faster on a `Roberta-base` model** with a batch of size 32 / seq len 256, benchmark on MNLI datasets (bold -> **quantization**):\n", "\n", - "| Framework | Precision | Latency (ms) | Accuracy | Speedup |\n", - "| -------------------------- | --------- | ------------ | -------- | ------- |\n", - "| Pytorch | FP32 | 76.31 | --- | X 1 |\n", - "| Pytorch | FP16 | 56.04 | --- | X 1.4 |\n", - "| TensorRT | FP16 | 30.17 | --- | X 2.5 |\n", - "| TensorRT (transplantation) | **INT-8** | 15.83 | --- | **X 5** |\n", - "| TensorRT (custom QDQ code) | **INT-8** | 14.94 | --- | **X 5** |\n", + "| Framework | Precision | Latency (ms) | Accuracy | Speedup | Hardware |\n", + "|:---------------------------|-----------|--------------|----------|:----------|:--------:|\n", + "| Pytorch | FP32 | 4407 | 86.8 % | X 0.02 | CPU |\n", + "| Pytorch | FP16 | 4255 | 86.8 % | X 0.02 | CPU |\n", + "| Pytorch | FP32 | 77 | 86.8 % | X 1 | GPU |\n", + "| Pytorch | FP16 | 58 | 86.8 % | X 1.3 | GPU |\n", + "| TensorRT | FP16 | 30 | 86.8 % | X 2.6 | GPU |\n", + "| TensorRT (transplantation) | **INT-8** | 15 | 84.8 % | **X 5.1** | GPU |\n", 
+ "| TensorRT (custom QDQ code) | **INT-8** | 15 | 85.6 % | **X 5.1** | GPU |\n", "\n", + "> measures done on a Nvidia RTX 3090 GPU + 12 cores i7 Intel CPU\n", + "> accuracy obtained after a single epoch, no LR search or any hyper parameter optimization\n", + "> CPU measures are unfair but still indicative of what kind of perf to expect from Pytorch+CPU deployment\n", "> same kind of acceleration is observed on all seq len / batch sizes\n", "\n", "\n", @@ -133,7 +138,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tue Dec 7 21:16:44 2021 \r\n", + "Wed Dec 8 07:41:28 2021 \r\n", "+-----------------------------------------------------------------------------+\r\n", "| NVIDIA-SMI 495.29.05 Driver Version: 495.29.05 CUDA Version: 11.5 |\r\n", "|-------------------------------+----------------------+----------------------+\r\n", @@ -142,7 +147,7 @@ "| | | MIG M. |\r\n", "|===============================+======================+======================|\r\n", "| 0 NVIDIA GeForce ... On | 00000000:03:00.0 On | N/A |\r\n", - "| 35% 42C P8 40W / 350W | 221MiB / 24267MiB | 0% Default |\r\n", + "| 30% 40C P8 37W / 350W | 499MiB / 24267MiB | 0% Default |\r\n", "| | | N/A |\r\n", "+-------------------------------+----------------------+----------------------+\r\n", " \r\n", @@ -151,10 +156,11 @@ "| GPU GI CI PID Type Process name GPU Memory |\r\n", "| ID ID Usage |\r\n", "|=============================================================================|\r\n", - "| 0 N/A N/A 1903 G /usr/lib/xorg/Xorg 124MiB |\r\n", - "| 0 N/A N/A 7277 G /usr/bin/gnome-shell 38MiB |\r\n", - "| 0 N/A N/A 58986 G ..._57461.log --shared-files 19MiB |\r\n", - "| 0 N/A N/A 63844 G ...AAAAAAAAA= --shared-files 35MiB |\r\n", + "| 0 N/A N/A 1632 G /usr/lib/xorg/Xorg 119MiB |\r\n", + "| 0 N/A N/A 7547 G /usr/bin/gnome-shell 37MiB |\r\n", + "| 0 N/A N/A 23797 G ..._12759.log --shared-files 16MiB |\r\n", + "| 0 N/A N/A 23894 G ...AAAAAAAAA= --shared-files 69MiB |\r\n", + "| 0 N/A N/A 291688 C ...st_transformer/bin/python 251MiB |\r\n", "+-----------------------------------------------------------------------------+\r\n" ] } @@ -202,8 +208,9 @@ "source": [ "import numpy as np\n", "from tqdm.notebook import tqdm\n", - "\n", - "from typing import OrderedDict as OD, List, Dict\n", + "import transformers\n", + "import datasets\n", + "from typing import OrderedDict as OD, List, Dict, Union\n", "import torch\n", "from torch import Tensor\n", "from transformers import (\n", @@ -214,31 +221,20 @@ " TrainingArguments,\n", " Trainer,\n", " IntervalStrategy,\n", + " AutoTokenizer,\n", + " PreTrainedTokenizer,\n", ")\n", + "from datasets import load_dataset, load_metric\n", "from transformer_deploy.QDQModels.QDQRoberta import QDQRobertaForSequenceClassification\n", "import pytorch_quantization.nn as quant_nn\n", "from pytorch_quantization.tensor_quant import QuantDescriptor\n", "from pytorch_quantization import calib\n", "import logging\n", - "import transformers\n", - "import datasets\n", "from datasets import DatasetDict\n", "from transformer_deploy.backends.trt_utils import build_engine, get_binding_idxs, infer_tensorrt, load_engine\n", "from transformer_deploy.backends.ort_utils import convert_to_onnx\n", "from collections import OrderedDict\n", - "from transformer_deploy.benchmarks.utils import track_infer_time, print_timings" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ + "from transformer_deploy.benchmarks.utils import 
track_infer_time, print_timings\n", "from pycuda._driver import Stream\n", "import tensorrt as trt\n", "from tensorrt.tensorrt import IExecutionContext, Logger, Runtime\n", @@ -254,7 +250,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -286,7 +282,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": { "id": "zVvslsfMIrIh" }, @@ -307,12 +303,12 @@ "id": "W7QYTpxXIrIl" }, "source": [ - "We will use the [🤗 Datasets](https://github.com/huggingface/datasets) library to download the data and get the metric we need to use for evaluation (to compare our model to the benchmark). This can be easily done with the functions `load_dataset` and `load_metric`." + "We will use the [🤗 Datasets](https://github.com/huggingface/datasets) library to download the data and get the metric we need to use for evaluation (to compare our model to the benchmark)." ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": { "id": "IreSlFmlIrIm" }, @@ -320,7 +316,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e6b052cb482a4df89d092e40bca792b3", + "model_id": "157435bd8610413f83c3bf7bdff3fb5d", "version_major": 2, "version_minor": 0 }, @@ -358,14 +354,12 @@ "})" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from datasets import load_dataset, load_metric\n", - "\n", "dataset = load_dataset(\"glue\", task)\n", "metric = load_metric(\"glue\", task)\n", "dataset" @@ -398,7 +392,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -454,8 +448,6 @@ }, "outputs": [], "source": [ - "from transformers import AutoTokenizer, PreTrainedTokenizer\n", - "\n", "tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)" ] }, @@ -465,12 +457,12 @@ "id": "2C0hcmp9IrJQ" }, "source": [ - "We can them write the function that will preprocess our samples. We just feed them to the `tokenizer` with the argument `truncation=True`. This will ensure that an input longer that what the model selected can handle will be truncated to the maximum length accepted by the model." + "We can them write the function that will preprocess our samples. We just feed them to the `tokenizer` with the argument `truncation=True` and `padding=\"max_length\"`. This will ensure that all sequences have the same size." 
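The `preprocess_function` cell itself is unchanged by this commit and therefore not shown in the diff; a minimal sketch of what such a function looks like for MNLI, assuming the usual `premise`/`hypothesis` columns and the `max_seq_len` value defined earlier in the notebook:

```python
def preprocess_function(examples):
    # pad/truncate every sentence pair to a fixed length so the exported
    # ONNX / TensorRT models always receive a (batch, max_seq_len) input
    return tokenizer(
        examples["premise"],
        examples["hypothesis"],
        truncation=True,
        padding="max_length",
        max_length=max_seq_len,
    )

# applied over the whole DatasetDict, producing the encoded_dataset used below
encoded_dataset = dataset.map(preprocess_function, batched=True)
```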
] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": { "id": "vc0BSBLIIrJQ" }, @@ -484,7 +476,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -546,7 +538,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 10, "metadata": { "pycharm": { "name": "#%%\n" @@ -554,9 +546,6 @@ }, "outputs": [], "source": [ - "from typing import Union\n", - "\n", - "\n", "def compute_metrics(eval_pred):\n", " predictions, labels = eval_pred\n", " if task != \"stsb\":\n", @@ -620,45 +609,36 @@ ] }, { - "cell_type": "code", - "execution_count": 12, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "runtime: Runtime = trt.Runtime(trt_logger)\n", - "profile_index = 0" + "Some `TensorRT` reused variables:" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 11, "metadata": {}, + "outputs": [], "source": [ - "## Fine-tuning model" + "runtime: Runtime = trt.Runtime(trt_logger)\n", + "profile_index = 0" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "## Fine-tuning model\n", + "\n", "Now that our data are ready, we can download the pretrained model and fine-tune it.\n", "\n", - "We will also prepare some export function right now" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ "Default parameters to be used for the training:" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -714,7 +694,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": { "pycharm": { "name": "#%%\n" @@ -756,7 +736,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -790,29 +770,47 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[INFO|trainer.py:437] 2021-12-07 16:55:07,169 >> Using amp half precision backend\n" + "[INFO|trainer.py:437] 2021-12-08 07:41:50,834 >> Using amp half precision backend\n" ] }, { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)", - "\u001B[0;32m/tmp/ipykernel_1083276/2185767024.py\u001B[0m in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[1;32m 12\u001B[0m )\n\u001B[1;32m 13\u001B[0m \u001B[0mtransformers\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mlogging\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mset_verbosity_error\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 14\u001B[0;31m \u001B[0mtrainer\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtrain\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 15\u001B[0m \u001B[0mprint\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mtrainer\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mevaluate\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 16\u001B[0m 
\u001B[0mmodel_bert\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0msave_pretrained\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m\"roberta-in-bert-trained\"\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", - "\u001B[0;32m~/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/transformers/trainer.py\u001B[0m in \u001B[0;36mtrain\u001B[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001B[0m\n\u001B[1;32m 1321\u001B[0m \u001B[0mtr_loss_step\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtraining_step\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmodel\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0minputs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1322\u001B[0m \u001B[0;32melse\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m-> 1323\u001B[0;31m \u001B[0mtr_loss_step\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtraining_step\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmodel\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0minputs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 1324\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1325\u001B[0m if (\n", - "\u001B[0;32m~/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/transformers/trainer.py\u001B[0m in \u001B[0;36mtraining_step\u001B[0;34m(self, model, inputs)\u001B[0m\n\u001B[1;32m 1875\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1876\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mdo_grad_scaling\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m-> 1877\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mscaler\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mscale\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mloss\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mbackward\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 1878\u001B[0m \u001B[0;32melif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0muse_apex\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1879\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mamp\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mscale_loss\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mloss\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0moptimizer\u001B[0m\u001B[0;34m)\u001B[0m \u001B[0;32mas\u001B[0m \u001B[0mscaled_loss\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", - "\u001B[0;32m~/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/torch/_tensor.py\u001B[0m in \u001B[0;36mbackward\u001B[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001B[0m\n\u001B[1;32m 305\u001B[0m \u001B[0mcreate_graph\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0mcreate_graph\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 306\u001B[0m inputs=inputs)\n\u001B[0;32m--> 307\u001B[0;31m \u001B[0mtorch\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mautograd\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mbackward\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mgradient\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mretain_graph\u001B[0m\u001B[0;34m,\u001B[0m 
\u001B[0mcreate_graph\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0minputs\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0minputs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 308\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 309\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0mregister_hook\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mhook\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", - "\u001B[0;32m~/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/torch/autograd/__init__.py\u001B[0m in \u001B[0;36mbackward\u001B[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001B[0m\n\u001B[1;32m 152\u001B[0m \u001B[0mretain_graph\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mcreate_graph\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 153\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 154\u001B[0;31m Variable._execution_engine.run_backward(\n\u001B[0m\u001B[1;32m 155\u001B[0m \u001B[0mtensors\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mgrad_tensors_\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mretain_graph\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mcreate_graph\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0minputs\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 156\u001B[0m allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag\n", - "\u001B[0;31mKeyboardInterrupt\u001B[0m: " + "name": "stdout", + "output_type": "stream", + "text": [ + "{'loss': 0.7658, 'learning_rate': 9.1875814863103e-06, 'epoch': 0.08}\n", + "{'eval_loss': 0.5338948369026184, 'eval_accuracy': 0.7948038716250637, 'eval_runtime': 18.3625, 'eval_samples_per_second': 534.514, 'eval_steps_per_second': 8.387, 'epoch': 0.08}\n", + "{'loss': 0.5566, 'learning_rate': 8.372718383311604e-06, 'epoch': 0.16}\n", + "{'eval_loss': 0.4757803678512573, 'eval_accuracy': 0.8167091186958737, 'eval_runtime': 18.39, 'eval_samples_per_second': 533.713, 'eval_steps_per_second': 8.374, 'epoch': 0.16}\n", + "{'loss': 0.5135, 'learning_rate': 7.557855280312908e-06, 'epoch': 0.24}\n", + "{'eval_loss': 0.46861791610717773, 'eval_accuracy': 0.8164034640855833, 'eval_runtime': 18.4131, 'eval_samples_per_second': 533.044, 'eval_steps_per_second': 8.364, 'epoch': 0.24}\n", + "{'loss': 0.4868, 'learning_rate': 6.743807040417211e-06, 'epoch': 0.33}\n", + "{'eval_loss': 0.4253948926925659, 'eval_accuracy': 0.8351502801833928, 'eval_runtime': 18.4305, 'eval_samples_per_second': 532.543, 'eval_steps_per_second': 8.356, 'epoch': 0.33}\n", + "{'loss': 0.4669, 'learning_rate': 5.9289439374185145e-06, 'epoch': 0.41}\n", + "{'eval_loss': 0.4190593957901001, 'eval_accuracy': 0.8383087111563933, 'eval_runtime': 18.4268, 'eval_samples_per_second': 532.649, 'eval_steps_per_second': 8.357, 'epoch': 0.41}\n", + "{'loss': 0.4544, 'learning_rate': 5.114080834419818e-06, 'epoch': 0.49}\n", + "{'eval_loss': 0.4306202828884125, 'eval_accuracy': 0.8335201222618441, 'eval_runtime': 18.4565, 'eval_samples_per_second': 531.792, 'eval_steps_per_second': 8.344, 'epoch': 0.49}\n", + "{'loss': 0.4542, 'learning_rate': 4.30003259452412e-06, 'epoch': 0.57}\n", + "{'eval_loss': 0.40120720863342285, 'eval_accuracy': 0.844727457972491, 'eval_runtime': 18.4367, 'eval_samples_per_second': 532.362, 'eval_steps_per_second': 8.353, 'epoch': 0.57}\n", + "{'loss': 0.4427, 'learning_rate': 3.4851694915254244e-06, 'epoch': 0.65}\n", 
+ "{'eval_loss': 0.3936639130115509, 'eval_accuracy': 0.8454406520631687, 'eval_runtime': 18.4345, 'eval_samples_per_second': 532.425, 'eval_steps_per_second': 8.354, 'epoch': 0.65}\n", + "{'loss': 0.4369, 'learning_rate': 2.670306388526728e-06, 'epoch': 0.73}\n", + "{'eval_loss': 0.3961443305015564, 'eval_accuracy': 0.8489047376464595, 'eval_runtime': 18.4534, 'eval_samples_per_second': 531.879, 'eval_steps_per_second': 8.345, 'epoch': 0.73}\n", + "{'loss': 0.4257, 'learning_rate': 1.8554432855280313e-06, 'epoch': 0.81}\n", + "{'eval_loss': 0.39044129848480225, 'eval_accuracy': 0.8509424350483953, 'eval_runtime': 18.4536, 'eval_samples_per_second': 531.876, 'eval_steps_per_second': 8.345, 'epoch': 0.81}\n", + "{'loss': 0.4285, 'learning_rate': 1.0413950456323338e-06, 'epoch': 0.9}\n", + "{'eval_loss': 0.38357552886009216, 'eval_accuracy': 0.8525725929699439, 'eval_runtime': 18.4857, 'eval_samples_per_second': 530.952, 'eval_steps_per_second': 8.331, 'epoch': 0.9}\n", + "{'loss': 0.4278, 'learning_rate': 2.265319426336376e-07, 'epoch': 0.98}\n", + "{'eval_loss': 0.3847087025642395, 'eval_accuracy': 0.8522669383596536, 'eval_runtime': 18.4593, 'eval_samples_per_second': 531.711, 'eval_steps_per_second': 8.343, 'epoch': 0.98}\n", + "{'train_runtime': 2604.6513, 'train_samples_per_second': 150.77, 'train_steps_per_second': 4.712, 'train_loss': 0.48698368594730385, 'epoch': 1.0}\n", + "{'eval_loss': 0.38357552886009216, 'eval_accuracy': 0.8525725929699439, 'eval_runtime': 18.4563, 'eval_samples_per_second': 531.796, 'eval_steps_per_second': 8.344, 'epoch': 1.0}\n", + "{'eval_loss': 0.38357552886009216, 'eval_accuracy': 0.8525725929699439, 'eval_runtime': 18.4563, 'eval_samples_per_second': 531.796, 'eval_steps_per_second': 8.344, 'epoch': 1.0}\n" ] } ], @@ -878,7 +876,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -905,14 +903,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "946664d7c0684b6e903745802c39fa17", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/4 [00:00> Using amp half precision backend\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'eval_loss': 0.43096092343330383, 'eval_accuracy': 0.8348446255731024, 'eval_runtime': 46.2449, 'eval_samples_per_second': 212.24, 'eval_steps_per_second': 3.33}\n", + "{'eval_loss': 0.43096092343330383, 'eval_accuracy': 0.8348446255731024, 'eval_runtime': 46.2449, 'eval_samples_per_second': 212.24, 'eval_steps_per_second': 3.33}\n", + "{'loss': 0.4542, 'learning_rate': 9.187581486310299e-07, 'epoch': 0.08}\n", + "{'eval_loss': 0.4320202171802521, 'eval_accuracy': 0.8392256749872644, 'eval_runtime': 47.5223, 'eval_samples_per_second': 206.535, 'eval_steps_per_second': 3.241, 'epoch': 0.08}\n", + "{'loss': 0.4439, 'learning_rate': 8.372718383311604e-07, 'epoch': 0.16}\n", + "{'eval_loss': 0.4244120717048645, 'eval_accuracy': 0.8415690269994905, 'eval_runtime': 46.9517, 'eval_samples_per_second': 209.045, 'eval_steps_per_second': 3.28, 'epoch': 0.16}\n", + "{'loss': 0.4323, 'learning_rate': 7.557855280312907e-07, 'epoch': 0.24}\n", + "{'eval_loss': 0.4180322289466858, 'eval_accuracy': 0.8435048395313296, 'eval_runtime': 46.8629, 'eval_samples_per_second': 209.441, 'eval_steps_per_second': 3.286, 'epoch': 0.24}\n", + "{'loss': 0.4254, 
'learning_rate': 6.74380704041721e-07, 'epoch': 0.33}\n", + "{'eval_loss': 0.42280977964401245, 'eval_accuracy': 0.8436067244014264, 'eval_runtime': 46.8872, 'eval_samples_per_second': 209.332, 'eval_steps_per_second': 3.284, 'epoch': 0.33}\n", + "{'loss': 0.4285, 'learning_rate': 5.928943937418513e-07, 'epoch': 0.41}\n", + "{'eval_loss': 0.416576623916626, 'eval_accuracy': 0.8393275598573612, 'eval_runtime': 48.0341, 'eval_samples_per_second': 204.334, 'eval_steps_per_second': 3.206, 'epoch': 0.41}\n", + "{'loss': 0.427, 'learning_rate': 5.114080834419818e-07, 'epoch': 0.49}\n", + "{'eval_loss': 0.41878825426101685, 'eval_accuracy': 0.8414671421293938, 'eval_runtime': 48.3193, 'eval_samples_per_second': 203.128, 'eval_steps_per_second': 3.187, 'epoch': 0.49}\n", + "{'loss': 0.4207, 'learning_rate': 4.2992177314211206e-07, 'epoch': 0.57}\n", + "{'eval_loss': 0.42357301712036133, 'eval_accuracy': 0.8398369842078451, 'eval_runtime': 48.6821, 'eval_samples_per_second': 201.614, 'eval_steps_per_second': 3.163, 'epoch': 0.57}\n", + "{'loss': 0.425, 'learning_rate': 3.4859843546284223e-07, 'epoch': 0.65}\n", + "{'eval_loss': 0.41158831119537354, 'eval_accuracy': 0.8456444218033622, 'eval_runtime': 48.0513, 'eval_samples_per_second': 204.261, 'eval_steps_per_second': 3.205, 'epoch': 0.65}\n", + "{'loss': 0.4283, 'learning_rate': 2.6711212516297263e-07, 'epoch': 0.73}\n", + "{'eval_loss': 0.40967991948127747, 'eval_accuracy': 0.8455425369332654, 'eval_runtime': 47.2486, 'eval_samples_per_second': 207.731, 'eval_steps_per_second': 3.259, 'epoch': 0.73}\n", + "{'loss': 0.4162, 'learning_rate': 1.85625814863103e-07, 'epoch': 0.81}\n", + "{'eval_loss': 0.417491614818573, 'eval_accuracy': 0.844319918492104, 'eval_runtime': 46.8968, 'eval_samples_per_second': 209.289, 'eval_steps_per_second': 3.284, 'epoch': 0.81}\n", + "{'loss': 0.4179, 'learning_rate': 1.0422099087353324e-07, 'epoch': 0.9}\n", + "{'eval_loss': 0.4117409586906433, 'eval_accuracy': 0.8449312277126847, 'eval_runtime': 50.0029, 'eval_samples_per_second': 196.289, 'eval_steps_per_second': 3.08, 'epoch': 0.9}\n", + "{'loss': 0.4201, 'learning_rate': 2.2734680573663624e-08, 'epoch': 0.98}\n", + "{'eval_loss': 0.4105292558670044, 'eval_accuracy': 0.8482934284258787, 'eval_runtime': 49.9141, 'eval_samples_per_second': 196.638, 'eval_steps_per_second': 3.085, 'epoch': 0.98}\n", + "{'train_runtime': 5124.3924, 'train_samples_per_second': 76.634, 'train_steps_per_second': 2.395, 'train_loss': 0.4281486333426783, 'epoch': 1.0}\n", + "{'eval_loss': 0.4105292558670044, 'eval_accuracy': 0.8482934284258787, 'eval_runtime': 51.6384, 'eval_samples_per_second': 190.072, 'eval_steps_per_second': 2.982, 'epoch': 1.0}\n", + "{'eval_loss': 0.4105292558670044, 'eval_accuracy': 0.8482934284258787, 'eval_runtime': 51.6384, 'eval_samples_per_second': 190.072, 'eval_steps_per_second': 2.982, 'epoch': 1.0}\n" + ] + } + ], "source": [ "model_q = QDQBertForSequenceClassification.from_pretrained(\"roberta-in-bert-trained-quantized\", num_labels=num_labels)\n", "model_q = model_q.cuda()\n", @@ -969,7 +1027,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -983,7 +1041,7 @@ "from pytorch_quantization.nn import TensorQuantizer\n", "\n", "TensorQuantizer.use_fb_fake_quant = True\n", - "convert_to_onnx(model_q, output_path=\"model_q.onnx\", inputs_pytorch=input_torch)\n", + "convert_to_onnx(model_q, output_path=\"model_q.onnx\", inputs_pytorch=input_torch, opset=13)\n", 
"TensorQuantizer.use_fb_fake_quant = False\n", "# del model_q" ] @@ -1001,7 +1059,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 25, "metadata": { "pycharm": { "name": "#%%\n" @@ -1024,7 +1082,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 26, "metadata": { "pycharm": { "name": "#%%\n" @@ -1050,7 +1108,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 27, "metadata": { "pycharm": { "name": "#%%\n" @@ -1066,7 +1124,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 28, "metadata": { "pycharm": { "name": "#%%\n" @@ -1075,7 +1133,6 @@ "outputs": [], "source": [ "data = encoded_dataset[\"train\"][0:batch_size]\n", - "input_torch: Dict[str, torch.Tensor] = convert_tensor(data, output=\"torch\")\n", "input_np: Dict[str, np.ndarray] = convert_tensor(data, output=\"np\")" ] }, @@ -1099,7 +1156,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 29, "metadata": { "pycharm": { "name": "#%%\n" @@ -1110,38 +1167,38 @@ "name": "stdout", "output_type": "stream", "text": [ - "[array([[ 5.6351620e-01, 1.4767665e+00, -2.0787194e+00],\n", - " [ 2.3301950e+00, -1.0177574e+00, -1.3668290e+00],\n", - " [ 1.5580183e+00, -5.9583592e-01, -1.1548299e+00],\n", - " [ 1.9603873e+00, -2.0616996e-01, -1.8071964e+00],\n", - " [ 2.4737215e+00, -2.8350648e-01, -1.8850105e+00],\n", - " [ 3.6134090e+00, -1.3006018e-01, -2.7839746e+00],\n", - " [-3.1495490e+00, 2.4353392e+00, 1.9919875e-01],\n", - " [ 3.3117905e+00, -7.3505348e-01, -2.0925450e+00],\n", - " [ 3.2750502e-01, -1.5198725e+00, 1.2251633e+00],\n", - " [-3.7192254e+00, -5.1082242e-01, 3.6361742e+00],\n", - " [ 3.1723669e+00, -6.5267378e-01, -2.1629393e+00],\n", - " [-1.5052840e+00, -1.1153723e+00, 2.1314652e+00],\n", - " [-2.7875674e+00, 3.3702278e+00, -9.6062738e-01],\n", - " [-2.1700766e+00, 2.1553783e+00, -4.1763881e-01],\n", - " [-1.2523253e-01, -9.4394463e-01, 8.0471390e-01],\n", - " [ 2.3903012e+00, -1.0954552e+00, -1.0219078e+00],\n", - " [ 3.7135108e+00, -6.1678243e-01, -2.5324042e+00],\n", - " [-2.8983197e+00, -1.9243273e+00, 4.2502666e+00],\n", - " [-3.1470397e+00, -1.6737628e+00, 4.2269526e+00],\n", - " [-3.1141593e+00, 3.4683597e+00, -7.6941836e-01],\n", - " [ 3.8057449e+00, -6.6588068e-01, -2.4926093e+00],\n", - " [ 2.6230648e+00, 2.3657779e-01, -2.3784602e+00],\n", - " [-2.1757143e+00, 3.6484423e-01, 1.2388697e+00],\n", - " [ 3.7942352e+00, -4.8870793e-01, -2.6957376e+00],\n", - " [ 3.6744323e+00, -1.3134056e+00, -1.7758287e+00],\n", - " [-1.1789101e+00, 1.9029677e-01, 4.4781533e-01],\n", - " [-2.7335472e+00, 1.4046015e+00, 8.6339402e-01],\n", - " [-1.3156077e+00, 1.9026613e+00, -7.8255135e-01],\n", - " [-3.3838544e+00, -6.8161070e-01, 3.4489069e+00],\n", - " [ 2.7053127e+00, -4.8565903e-01, -2.0700452e+00],\n", - " [ 2.7803206e+00, -4.0302199e-01, -2.2101507e+00],\n", - " [ 3.1589518e+00, -1.0739815e-03, -2.7553422e+00]], dtype=float32)]\n" + "[array([[ 0.34206298, 1.5652132 , -2.3528326 ],\n", + " [ 2.5013878 , -0.81571996, -1.6251811 ],\n", + " [ 1.8918471 , -0.76798105, -1.0148249 ],\n", + " [ 2.0562491 , -0.22451262, -1.8686965 ],\n", + " [ 2.586117 , -0.09310705, -2.4128742 ],\n", + " [ 3.1871881 , -0.38016185, -2.5407064 ],\n", + " [-3.4681158 , 2.25822 , 0.37315404],\n", + " [ 3.5095093 , -0.8846639 , -2.5989952 ],\n", + " [-0.17400724, -1.6495969 , 1.7838944 ],\n", + " [-2.966234 , -1.4364657 , 4.0166936 ],\n", + " [ 3.275045 , -0.9761375 , -2.1260378 ],\n", + " [-1.35331 , -0.42718923, 1.3907498 ],\n", + " 
[-2.6201942 , 2.9925148 , -1.0296444 ],\n",
+ " [-2.8947299 , 2.072019 , 0.1730565 ],\n",
+ " [ 0.10867599, -0.7385151 , 0.35388532],\n",
+ " [ 3.0392425 , -0.94136757, -1.9179116 ],\n",
+ " [ 3.5692515 , -0.6002568 , -2.7545912 ],\n",
+ " [-2.6759057 , -1.738315 , 4.1253285 ],\n",
+ " [-3.2203894 , -1.2297541 , 4.019567 ],\n",
+ " [-2.4096491 , 3.5356538 , -1.7411288 ],\n",
+ " [ 3.8419678 , -0.9140588 , -2.8194869 ],\n",
+ " [ 2.7242563 , 0.10581933, -2.7189605 ],\n",
+ " [-2.6767159 , 0.0738265 , 1.8019531 ],\n",
+ " [ 3.4024699 , -0.23903687, -3.2066634 ],\n",
+ " [ 3.2721906 , -1.4004866 , -1.7683858 ],\n",
+ " [-1.3776261 , 0.23932378, 0.65892386],\n",
+ " [-2.2985775 , 1.4366189 , 0.42702717],\n",
+ " [-2.0242352 , 2.6943915 , -1.1765195 ],\n",
+ " [-3.738225 , -0.40719697, 3.6082602 ],\n",
+ " [ 3.3571942 , -0.5865445 , -2.7262824 ],\n",
+ " [ 2.5306373 , -0.16031216, -2.4750497 ],\n",
+ " [ 2.9033797 , 0.02746576, -2.9880157 ]], dtype=float32)]\n"
    ]
   }
  ],
@@ -1165,14 +1222,14 @@
 },
 {
  "cell_type": "code",
- "execution_count": 19,
+ "execution_count": 30,
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
- "[TensorRT (FP16)] mean=15.83ms, sd=1.68ms, min=14.22ms, max=25.70ms, median=15.09ms, 95p=17.72ms, 99p=19.29ms\n"
+ "[TensorRT (INT-8)] mean=15.42ms, sd=1.35ms, min=14.16ms, max=18.86ms, median=14.58ms, 95p=17.79ms, 99p=18.25ms\n"
 ]
 }
 ],
@@ -1218,7 +1275,7 @@
 "* replace linear layers with their quantized version\n",
 "* replace operations not supported out of the box by `TensorRT` by a similar code supporting the operation.\n",
 "\n",
- "> it's not a complex process, but it requires some knowledge of both `ONNX` supported operations and `TensorRT` framework\n",
+ "> a concrete example on the `Roberta` architecture: in the HF library, there is a `cumsum` in the position embedding generation. Something very simple: it takes an integer tensor as input and outputs an integer tensor. It happens that the `cumsum` operator from TensorRT supports float but not integer (https://github.com/onnx/onnx-tensorrt/blob/master/docs/operators.md), so the model conversion crashes with a cryptic error message. Converting the input to a float tensor fixes the issue, as sketched below. 
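A minimal sketch of that cast, modeled on the position-id helper from the Hugging Face `Roberta` implementation (the exact code shipped in `transformer_deploy.QDQModels.QDQRoberta` may differ):

```python
import torch

def create_position_ids_from_input_ids(input_ids: torch.Tensor, padding_idx: int) -> torch.Tensor:
    # same logic as the original helper, but the cumsum runs on a float tensor
    # because the TensorRT CumSum operator rejects integer inputs
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = torch.cumsum(mask.to(torch.float32), dim=1).to(mask.dtype) * mask
    return incremental_indices.long() + padding_idx
```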
Not complex, but requires some knowledge.\n", "\n", "The process below is a bit simpler than the method 1:\n", "\n", @@ -1230,39 +1287,6 @@ "### Fine tuning the model" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "# model_roberta: PreTrainedModel = AutoModelForSequenceClassification.from_pretrained(\n", - "# model_checkpoint, num_labels=num_labels\n", - "# )\n", - "# model_roberta = model_roberta.cuda()\n", - "\n", - "# args.learning_rate = 1e-5\n", - "# trainer = Trainer(\n", - "# model_roberta,\n", - "# args,\n", - "# train_dataset=encoded_dataset[\"train\"],\n", - "# eval_dataset=encoded_dataset[validation_key],\n", - "# tokenizer=tokenizer,\n", - "# compute_metrics=compute_metrics,\n", - "# )\n", - "# transformers.logging.set_verbosity_error()\n", - "# trainer.train()\n", - "# print(trainer.evaluate())\n", - "# # {'eval_loss': 0.3559744358062744, 'eval_accuracy': 0.8655119714722364, 'eval_runtime': 19.6678, 'eval_samples_per_second': 499.04, 'eval_steps_per_second': 7.83, 'epoch': 0.98}\n", - "# trainer.save_model(\"roberta-model\")\n", - "# del model_roberta\n", - "# del trainer" - ] - }, { "cell_type": "markdown", "metadata": { @@ -1276,7 +1300,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 32, "metadata": { "pycharm": { "name": "#%%\n" @@ -1286,21 +1310,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2614c39f73cb4a219791b89922f11c36", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "0it [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b43fc12b3f1d443a8a4079e8c7900ace", + "model_id": "7558e2e658444b8a8814a6c14ce3966a", "version_major": 2, "version_minor": 0 }, @@ -1325,9 +1335,9 @@ ")\n", "model_roberta.save_pretrained(\"roberta-untrained-quantized\")\n", "del model_roberta\n", + "\n", "model_roberta_q: PreTrainedModel = QDQRobertaForSequenceClassification.from_pretrained(\"roberta-untrained-quantized\")\n", "model_roberta_q = calibrate(model=model_roberta_q, encoded_dataset=encoded_dataset)\n", - "\n", "model_roberta_q.save_pretrained(\"roberta-untrained-quantized\")\n", "del model_roberta_q" ] @@ -1345,7 +1355,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 33, "metadata": { "pycharm": { "name": "#%%\n" @@ -1356,32 +1366,40 @@ "name": "stderr", "output_type": "stream", "text": [ - "[INFO|trainer.py:437] 2021-12-07 22:51:46,364 >> Using amp half precision backend\n" + "[INFO|trainer.py:437] 2021-12-08 11:40:25,911 >> Using amp half precision backend\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "{'loss': 0.7546, 'learning_rate': 9.1875814863103e-06, 'epoch': 0.08}\n", - "{'eval_loss': 0.5080597400665283, 'eval_accuracy': 0.8026490066225166, 'eval_runtime': 46.668, 'eval_samples_per_second': 210.315, 'eval_steps_per_second': 3.3, 'epoch': 0.08}\n", - "{'loss': 0.5466, 'learning_rate': 8.372718383311604e-06, 'epoch': 0.16}\n", - "{'eval_loss': 0.452316552400589, 'eval_accuracy': 0.8242485990830362, 'eval_runtime': 46.9082, 'eval_samples_per_second': 209.238, 'eval_steps_per_second': 3.283, 'epoch': 0.16}\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", - 
"\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)", - "\u001B[0;32m/tmp/ipykernel_1192642/1887935502.py\u001B[0m in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[1;32m 14\u001B[0m )\n\u001B[1;32m 15\u001B[0m \u001B[0mtransformers\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mlogging\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mset_verbosity_error\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 16\u001B[0;31m \u001B[0mtrainer\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtrain\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 17\u001B[0m \u001B[0mprint\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mtrainer\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mevaluate\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 18\u001B[0m \u001B[0;31m# {'eval_loss': 0.40235549211502075, 'eval_accuracy': 0.8589913397860418, 'eval_runtime': 46.1754, 'eval_samples_per_second': 212.559, 'eval_steps_per_second': 3.335, 'epoch': 1.0}\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", - "\u001B[0;32m~/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/transformers/trainer.py\u001B[0m in \u001B[0;36mtrain\u001B[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001B[0m\n\u001B[1;32m 1321\u001B[0m \u001B[0mtr_loss_step\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtraining_step\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmodel\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0minputs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1322\u001B[0m \u001B[0;32melse\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m-> 1323\u001B[0;31m \u001B[0mtr_loss_step\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtraining_step\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmodel\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0minputs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 1324\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1325\u001B[0m if (\n", - "\u001B[0;32m~/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/transformers/trainer.py\u001B[0m in \u001B[0;36mtraining_step\u001B[0;34m(self, model, inputs)\u001B[0m\n\u001B[1;32m 1875\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1876\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mdo_grad_scaling\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m-> 1877\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mscaler\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mscale\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mloss\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mbackward\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 1878\u001B[0m \u001B[0;32melif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0muse_apex\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1879\u001B[0m \u001B[0;32mwith\u001B[0m 
\u001B[0mamp\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mscale_loss\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mloss\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0moptimizer\u001B[0m\u001B[0;34m)\u001B[0m \u001B[0;32mas\u001B[0m \u001B[0mscaled_loss\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", - "\u001B[0;32m~/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/torch/_tensor.py\u001B[0m in \u001B[0;36mbackward\u001B[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001B[0m\n\u001B[1;32m 305\u001B[0m \u001B[0mcreate_graph\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0mcreate_graph\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 306\u001B[0m inputs=inputs)\n\u001B[0;32m--> 307\u001B[0;31m \u001B[0mtorch\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mautograd\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mbackward\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mgradient\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mretain_graph\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mcreate_graph\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0minputs\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0minputs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 308\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 309\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0mregister_hook\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mhook\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", - "\u001B[0;32m~/.local/share/virtualenvs/fast_transformer/lib/python3.9/site-packages/torch/autograd/__init__.py\u001B[0m in \u001B[0;36mbackward\u001B[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001B[0m\n\u001B[1;32m 152\u001B[0m \u001B[0mretain_graph\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mcreate_graph\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 153\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 154\u001B[0;31m Variable._execution_engine.run_backward(\n\u001B[0m\u001B[1;32m 155\u001B[0m \u001B[0mtensors\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mgrad_tensors_\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mretain_graph\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mcreate_graph\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0minputs\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 156\u001B[0m allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag\n", - "\u001B[0;31mKeyboardInterrupt\u001B[0m: " + "{'loss': 0.7745, 'learning_rate': 9.1875814863103e-06, 'epoch': 0.08}\n", + "{'eval_loss': 0.5123801827430725, 'eval_accuracy': 0.8002037697401936, 'eval_runtime': 46.8364, 'eval_samples_per_second': 209.559, 'eval_steps_per_second': 3.288, 'epoch': 0.08}\n", + "{'loss': 0.5453, 'learning_rate': 8.372718383311604e-06, 'epoch': 0.16}\n", + "{'eval_loss': 0.4548088014125824, 'eval_accuracy': 0.8248599083036169, 'eval_runtime': 50.0504, 'eval_samples_per_second': 196.102, 'eval_steps_per_second': 3.077, 'epoch': 0.16}\n", + "{'loss': 0.5076, 'learning_rate': 7.558670143415907e-06, 'epoch': 0.24}\n", + "{'eval_loss': 0.4582265615463257, 'eval_accuracy': 0.82190524707081, 'eval_runtime': 48.9017, 'eval_samples_per_second': 200.709, 'eval_steps_per_second': 3.149, 'epoch': 0.24}\n", + "{'loss': 0.4843, 'learning_rate': 6.743807040417211e-06, 'epoch': 0.33}\n", + 
"{'eval_loss': 0.41166964173316956, 'eval_accuracy': 0.8402445236882323, 'eval_runtime': 47.7718, 'eval_samples_per_second': 205.456, 'eval_steps_per_second': 3.224, 'epoch': 0.33}\n", + "{'loss': 0.4668, 'learning_rate': 5.9289439374185145e-06, 'epoch': 0.41}\n", + "{'eval_loss': 0.4195743799209595, 'eval_accuracy': 0.8379011716760061, 'eval_runtime': 51.3011, 'eval_samples_per_second': 191.321, 'eval_steps_per_second': 3.002, 'epoch': 0.41}\n", + "{'loss': 0.4558, 'learning_rate': 5.114080834419818e-06, 'epoch': 0.49}\n", + "{'eval_loss': 0.4104989171028137, 'eval_accuracy': 0.8442180336220071, 'eval_runtime': 48.8771, 'eval_samples_per_second': 200.81, 'eval_steps_per_second': 3.151, 'epoch': 0.49}\n", + "{'loss': 0.4504, 'learning_rate': 4.30003259452412e-06, 'epoch': 0.57}\n", + "{'eval_loss': 0.38803720474243164, 'eval_accuracy': 0.8504330106979113, 'eval_runtime': 49.1089, 'eval_samples_per_second': 199.862, 'eval_steps_per_second': 3.136, 'epoch': 0.57}\n", + "{'loss': 0.4401, 'learning_rate': 3.4851694915254244e-06, 'epoch': 0.65}\n", + "{'eval_loss': 0.3891218900680542, 'eval_accuracy': 0.8535914416709118, 'eval_runtime': 49.191, 'eval_samples_per_second': 199.528, 'eval_steps_per_second': 3.131, 'epoch': 0.65}\n", + "{'loss': 0.4329, 'learning_rate': 2.670306388526728e-06, 'epoch': 0.73}\n", + "{'eval_loss': 0.3848048150539398, 'eval_accuracy': 0.8504330106979113, 'eval_runtime': 47.4583, 'eval_samples_per_second': 206.813, 'eval_steps_per_second': 3.245, 'epoch': 0.73}\n", + "{'loss': 0.423, 'learning_rate': 1.8554432855280313e-06, 'epoch': 0.81}\n", + "{'eval_loss': 0.3859354257583618, 'eval_accuracy': 0.8538970962812023, 'eval_runtime': 47.4611, 'eval_samples_per_second': 206.801, 'eval_steps_per_second': 3.245, 'epoch': 0.81}\n", + "{'loss': 0.4266, 'learning_rate': 1.0413950456323338e-06, 'epoch': 0.9}\n", + "{'eval_loss': 0.3780878782272339, 'eval_accuracy': 0.8534895568008151, 'eval_runtime': 48.0721, 'eval_samples_per_second': 204.173, 'eval_steps_per_second': 3.204, 'epoch': 0.9}\n", + "{'loss': 0.4272, 'learning_rate': 2.265319426336376e-07, 'epoch': 0.98}\n", + "{'eval_loss': 0.37839093804359436, 'eval_accuracy': 0.8561385634233316, 'eval_runtime': 49.087, 'eval_samples_per_second': 199.951, 'eval_steps_per_second': 3.137, 'epoch': 0.98}\n", + "{'train_runtime': 5220.9785, 'train_samples_per_second': 75.216, 'train_steps_per_second': 2.351, 'train_loss': 0.4849226055120396, 'epoch': 1.0}\n", + "{'eval_loss': 0.37839093804359436, 'eval_accuracy': 0.8561385634233316, 'eval_runtime': 49.1273, 'eval_samples_per_second': 199.787, 'eval_steps_per_second': 3.135, 'epoch': 1.0}\n", + "{'eval_loss': 0.37839093804359436, 'eval_accuracy': 0.8561385634233316, 'eval_runtime': 49.1273, 'eval_samples_per_second': 199.787, 'eval_steps_per_second': 3.135, 'epoch': 1.0}\n" ] } ], @@ -1403,8 +1421,7 @@ "transformers.logging.set_verbosity_error()\n", "trainer.train()\n", "print(trainer.evaluate())\n", - "# {'eval_loss': 0.40235549211502075, 'eval_accuracy': 0.8589913397860418, 'eval_runtime': 46.1754, 'eval_samples_per_second': 212.559, 'eval_steps_per_second': 3.335, 'epoch': 1.0}\n", - "model_roberta_q.save_pretrained(\"roberta-in-bert-trained-quantized\")\n", + "model_roberta_q.save_pretrained(\"roberta-trained-quantized\")\n", "del model_roberta_q" ] }, @@ -1428,7 +1445,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 34, "metadata": { "pycharm": { "name": "#%%\n" @@ -1448,7 +1465,7 @@ ], "source": [ "model_roberta_q: PreTrainedModel = 
QDQRobertaForSequenceClassification.from_pretrained(\n", - " \"roberta-in-bert-trained-quantized-retrained\", num_labels=num_labels\n", + " \"roberta-trained-quantized\", num_labels=num_labels\n", ")\n", "model_roberta_q = model_roberta_q.cuda()\n", "\n", @@ -1458,7 +1475,7 @@ "from pytorch_quantization.nn import TensorQuantizer\n", "\n", "TensorQuantizer.use_fb_fake_quant = True\n", - "convert_to_onnx(model_pytorch=model_roberta_q, output_path=\"roberta_q.onnx\", inputs_pytorch=input_torch)\n", + "convert_to_onnx(model_pytorch=model_roberta_q, output_path=\"roberta_q.onnx\", inputs_pytorch=input_torch, opset=13)\n", "TensorQuantizer.use_fb_fake_quant = False" ] }, @@ -1471,7 +1488,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 35, "metadata": { "pycharm": { "name": "#%%\n" @@ -1494,7 +1511,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 36, "metadata": { "pycharm": { "name": "#%%\n" @@ -1516,7 +1533,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 37, "metadata": { "pycharm": { "name": "#%%\n" @@ -1532,7 +1549,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 38, "metadata": { "pycharm": { "name": "#%%\n" @@ -1556,7 +1573,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 39, "metadata": { "pycharm": { "name": "#%%\n" @@ -1567,38 +1584,38 @@ "name": "stdout", "output_type": "stream", "text": [ - "[array([[ 0.18246882, 1.1604499 , -1.9064299 ],\n", - " [ 2.2959347 , -0.90284556, -1.7264905 ],\n", - " [ 2.0187902 , -0.773502 , -1.5992272 ],\n", - " [ 1.0055674 , -0.11009257, -1.5396124 ],\n", - " [ 1.859256 , -0.2640643 , -1.9893125 ],\n", - " [ 4.013083 , -1.0762665 , -2.6802182 ],\n", - " [-3.8274715 , 2.82228 , 0.53441864],\n", - " [ 3.9432845 , -1.111492 , -2.5691993 ],\n", - " [-0.02036566, -1.372117 , 0.7166275 ],\n", - " [-3.4707005 , -1.0002614 , 4.073874 ],\n", - " [ 3.2287133 , -1.2782975 , -2.2006783 ],\n", - " [-2.6840115 , -1.0188127 , 3.1551964 ],\n", - " [-3.7200396 , 2.349886 , 0.5967207 ],\n", - " [-2.813374 , 2.1468658 , -0.05996893],\n", - " [-0.65687865, -0.6351316 , 0.39051658],\n", - " [ 2.4374492 , -1.0998904 , -1.6697674 ],\n", - " [ 3.8417404 , -0.5648582 , -2.9105089 ],\n", - " [-2.7713814 , -1.9245013 , 4.5542636 ],\n", - " [-3.3167183 , -1.1679829 , 4.150669 ],\n", - " [-2.3994412 , 3.990843 , -1.8524642 ],\n", - " [ 3.919186 , -0.6086215 , -3.0692425 ],\n", - " [ 2.6670258 , 0.80300426, -3.5866559 ],\n", - " [-3.1351302 , -0.55978656, 3.1145272 ],\n", - " [ 4.003843 , -0.3124757 , -3.3316247 ],\n", - " [ 3.922188 , -1.144398 , -2.5969653 ],\n", - " [-2.0846744 , 0.20275442, 0.92436594],\n", - " [-2.3019078 , -0.13688484, 1.519916 ],\n", - " [-1.7067494 , 3.948385 , -2.4575708 ],\n", - " [-3.6868966 , 0.2229948 , 2.682487 ],\n", - " [ 3.1668653 , -0.71280336, -2.5332575 ],\n", - " [ 3.3461478 , -0.47501963, -2.9260926 ],\n", - " [ 3.4306564 , -0.8531854 , -2.635625 ]], dtype=float32)]\n" + "[array([[ 0.00858257, 1.5917815 , -1.8337398 ],\n", + " [ 2.432996 , -1.3068045 , -1.9821789 ],\n", + " [ 1.1561737 , -0.86323494, -1.0034285 ],\n", + " [ 1.5863879 , -0.49799222, -1.7219063 ],\n", + " [ 1.7697937 , -0.11104879, -2.3511643 ],\n", + " [ 3.5160832 , -1.3530374 , -3.0601408 ],\n", + " [-3.4769394 , 2.0265098 , 1.874698 ],\n", + " [ 3.3827643 , -1.2117878 , -2.8793433 ],\n", + " [-0.17693216, -1.1394652 , 0.9083401 ],\n", + " [-2.8701797 , -0.7220555 , 4.0437098 ],\n", + " [ 3.2363806 , -1.5264729 , -2.39297 ],\n", + " 
[-2.4144251 , -0.68517655, 3.2756474 ],\n", + " [-2.5281413 , 2.697305 , -0.10096363],\n", + " [-2.4246836 , 2.7231753 , -0.41800928],\n", + " [ 0.01045033, -0.68109804, 0.3442644 ],\n", + " [ 2.307869 , -1.3556942 , -1.7211589 ],\n", + " [ 3.5693195 , -1.0019355 , -3.1455066 ],\n", + " [-2.253701 , -1.5583014 , 4.6081343 ],\n", + " [-2.986448 , -0.8324479 , 4.4171877 ],\n", + " [-2.3470848 , 3.5537364 , -1.2475395 ],\n", + " [ 3.5942395 , -1.2296011 , -3.0068402 ],\n", + " [ 3.0203044 , -0.39700866, -3.3843446 ],\n", + " [-2.5756757 , -0.686817 , 3.5764308 ],\n", + " [ 3.411901 , -1.0631186 , -3.2706409 ],\n", + " [ 3.393027 , -1.42746 , -2.8274863 ],\n", + " [-0.67953676, 0.03448357, 0.46320617],\n", + " [-2.6152198 , 0.57314056, 2.398291 ],\n", + " [-2.6590538 , 3.3507993 , -0.73685795],\n", + " [-2.5252337 , 0.72088015, 2.060882 ],\n", + " [ 2.9799984 , -0.9674468 , -2.915716 ],\n", + " [ 2.9330335 , -1.4430482 , -2.2108274 ],\n", + " [ 3.1044042 , -0.9246039 , -3.1474404 ]], dtype=float32)]\n" ] } ], @@ -1622,14 +1639,14 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[TensorRT (FP16)] mean=14.94ms, sd=0.61ms, min=14.64ms, max=17.95ms, median=14.77ms, 95p=16.19ms, 99p=17.74ms\n" + "[TensorRT (INT-8)] mean=15.77ms, sd=0.58ms, min=14.85ms, max=17.66ms, median=15.81ms, 95p=16.61ms, 99p=17.50ms\n" ] } ], @@ -1661,89 +1678,90 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### TensorRT baseline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Below we export a randomly initialized `Roberta` model, the purpose is to only check the performance on mixed precision (FP16, no quantization)." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "baseline_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)\n", - "baseline_model = baseline_model.cuda()\n", - "convert_to_onnx(baseline_model, output_path=\"baseline.onnx\", inputs_pytorch=input_torch)\n", - "del baseline_model" + "## Pytorch baseline\n", + "\n", + "### Finetuning" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 50, "metadata": { - "scrolled": true + "pycharm": { + "name": "#%%\n" + } }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[INFO|trainer.py:437] 2021-12-08 13:17:01,492 >> Using amp half precision backend\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "[TensorRT (FP16)] mean=30.17ms, sd=0.46ms, min=29.53ms, max=32.12ms, median=29.97ms, 95p=31.24ms, 99p=31.58ms\n" + "{'loss': 0.65, 'learning_rate': 9.1875814863103e-06, 'epoch': 0.08}\n", + "{'eval_loss': 0.4644513428211212, 'eval_accuracy': 0.8212939378502292, 'eval_runtime': 18.8325, 'eval_samples_per_second': 521.174, 'eval_steps_per_second': 8.177, 'epoch': 0.08}\n", + "{'loss': 0.4912, 'learning_rate': 8.372718383311604e-06, 'epoch': 0.16}\n", + "{'eval_loss': 0.4196386933326721, 'eval_accuracy': 0.8379011716760061, 'eval_runtime': 19.1574, 'eval_samples_per_second': 512.335, 'eval_steps_per_second': 8.039, 'epoch': 0.16}\n", + "{'loss': 0.4631, 'learning_rate': 7.558670143415907e-06, 'epoch': 0.24}\n", + "{'eval_loss': 0.42019498348236084, 'eval_accuracy': 0.8382068262862965, 'eval_runtime': 18.5971, 'eval_samples_per_second': 527.772, 'eval_steps_per_second': 8.281, 'epoch': 0.24}\n", + "{'loss': 0.4455, 
'learning_rate': 6.743807040417211e-06, 'epoch': 0.33}\n", + "{'eval_loss': 0.3791417181491852, 'eval_accuracy': 0.8584819154355579, 'eval_runtime': 18.955, 'eval_samples_per_second': 517.804, 'eval_steps_per_second': 8.124, 'epoch': 0.33}\n", + "{'loss': 0.4264, 'learning_rate': 5.929758800521513e-06, 'epoch': 0.41}\n", + "{'eval_loss': 0.38219019770622253, 'eval_accuracy': 0.8525725929699439, 'eval_runtime': 19.5476, 'eval_samples_per_second': 502.107, 'eval_steps_per_second': 7.878, 'epoch': 0.41}\n", + "{'loss': 0.4194, 'learning_rate': 5.1148956975228174e-06, 'epoch': 0.49}\n", + "{'eval_loss': 0.38966989517211914, 'eval_accuracy': 0.8525725929699439, 'eval_runtime': 19.41, 'eval_samples_per_second': 505.666, 'eval_steps_per_second': 7.934, 'epoch': 0.49}\n", + "{'loss': 0.416, 'learning_rate': 4.30003259452412e-06, 'epoch': 0.57}\n", + "{'eval_loss': 0.363924115896225, 'eval_accuracy': 0.8604177279673968, 'eval_runtime': 19.6734, 'eval_samples_per_second': 498.896, 'eval_steps_per_second': 7.828, 'epoch': 0.57}\n", + "{'loss': 0.4099, 'learning_rate': 3.4859843546284226e-06, 'epoch': 0.65}\n", + "{'eval_loss': 0.3566216826438904, 'eval_accuracy': 0.8620478858889455, 'eval_runtime': 19.5395, 'eval_samples_per_second': 502.317, 'eval_steps_per_second': 7.881, 'epoch': 0.65}\n", + "{'loss': 0.3995, 'learning_rate': 2.6711212516297265e-06, 'epoch': 0.73}\n", + "{'eval_loss': 0.3582080602645874, 'eval_accuracy': 0.8640855832908813, 'eval_runtime': 19.5525, 'eval_samples_per_second': 501.981, 'eval_steps_per_second': 7.876, 'epoch': 0.73}\n", + "{'loss': 0.3932, 'learning_rate': 1.8562581486310302e-06, 'epoch': 0.81}\n", + "{'eval_loss': 0.35252732038497925, 'eval_accuracy': 0.8660213958227203, 'eval_runtime': 19.3648, 'eval_samples_per_second': 506.847, 'eval_steps_per_second': 7.953, 'epoch': 0.81}\n", + "{'loss': 0.3941, 'learning_rate': 1.0422099087353325e-06, 'epoch': 0.9}\n", + "{'eval_loss': 0.3504713773727417, 'eval_accuracy': 0.8664289353031075, 'eval_runtime': 19.7085, 'eval_samples_per_second': 498.009, 'eval_steps_per_second': 7.814, 'epoch': 0.9}\n", + "{'loss': 0.3965, 'learning_rate': 2.2734680573663624e-07, 'epoch': 0.98}\n", + "{'eval_loss': 0.34929943084716797, 'eval_accuracy': 0.8682628629648497, 'eval_runtime': 18.6964, 'eval_samples_per_second': 524.966, 'eval_steps_per_second': 8.237, 'epoch': 0.98}\n", + "{'train_runtime': 2756.3926, 'train_samples_per_second': 142.47, 'train_steps_per_second': 4.452, 'train_loss': 0.4411108956964883, 'epoch': 1.0}\n", + "{'eval_loss': 0.34929943084716797, 'eval_accuracy': 0.8682628629648497, 'eval_runtime': 18.8099, 'eval_samples_per_second': 521.801, 'eval_steps_per_second': 8.187, 'epoch': 1.0}\n", + "{'eval_loss': 0.34929943084716797, 'eval_accuracy': 0.8682628629648497, 'eval_runtime': 18.8099, 'eval_samples_per_second': 521.801, 'eval_steps_per_second': 8.187, 'epoch': 1.0}\n" ] } ], "source": [ - "engine = build_engine(\n", - " runtime=runtime,\n", - " onnx_file_path=\"baseline.onnx\",\n", - " logger=trt_logger,\n", - " min_shape=(batch_size, max_seq_len),\n", - " optimal_shape=(batch_size, max_seq_len),\n", - " max_shape=(batch_size, max_seq_len),\n", - " workspace_size=10000 * 1024 * 1024,\n", - " fp16=True,\n", - " int8=False,\n", + "model_roberta: PreTrainedModel = AutoModelForSequenceClassification.from_pretrained(\n", + " model_checkpoint, num_labels=num_labels\n", ")\n", - "stream: Stream = pycuda.driver.Stream()\n", - "context: IExecutionContext = engine.create_execution_context()\n", - 
"context.set_optimization_profile_async(profile_index=profile_index, stream_handle=stream.handle)\n", - "input_binding_idxs, output_binding_idxs = get_binding_idxs(engine, profile_index) # type: List[int], List[int]\n", - "for _ in range(30):\n", - " _ = infer_tensorrt(\n", - " context=context,\n", - " host_inputs=input_np,\n", - " input_binding_idxs=input_binding_idxs,\n", - " output_binding_idxs=output_binding_idxs,\n", - " stream=stream,\n", - " )\n", - "time_buffer = list()\n", - "for _ in range(100):\n", - " with track_infer_time(time_buffer):\n", - " _ = infer_tensorrt(\n", - " context=context,\n", - " host_inputs=input_np,\n", - " input_binding_idxs=input_binding_idxs,\n", - " output_binding_idxs=output_binding_idxs,\n", - " stream=stream,\n", - " )\n", + "model_roberta = model_roberta.cuda()\n", "\n", - "print_timings(name=\"TensorRT (FP16)\", timings=time_buffer)\n", - "del engine, context" + "args.learning_rate = 1e-5\n", + "trainer = Trainer(\n", + " model_roberta,\n", + " args,\n", + " train_dataset=encoded_dataset[\"train\"],\n", + " eval_dataset=encoded_dataset[validation_key],\n", + " tokenizer=tokenizer,\n", + " compute_metrics=compute_metrics,\n", + ")\n", + "transformers.logging.set_verbosity_error()\n", + "trainer.train()\n", + "print(trainer.evaluate())\n", + "# {'eval_loss': 0.3559744358062744, 'eval_accuracy': 0.8655119714722364, 'eval_runtime': 19.6678, 'eval_samples_per_second': 499.04, 'eval_steps_per_second': 7.83, 'epoch': 0.98}\n", + "trainer.save_model(\"roberta-baseline\")\n", + "del model_roberta\n", + "del trainer" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Pytorch baseline" + "### GPU execution" ] }, { @@ -1755,61 +1773,213 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[Pytorch (FP32)] mean=76.31ms, sd=1.04ms, min=75.32ms, max=82.39ms, median=76.02ms, 95p=78.31ms, 99p=79.18ms\n" + "[Pytorch (FP32)] mean=83.53ms, sd=3.69ms, min=79.18ms, max=91.07ms, median=84.09ms, 95p=89.34ms, 99p=90.44ms\n" ] } ], "source": [ - "baseline_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)\n", + "baseline_model = AutoModelForSequenceClassification.from_pretrained(\"roberta-baseline\", num_labels=num_labels)\n", "baseline_model = baseline_model.cuda()\n", + "baseline_model = baseline_model.eval()\n", "\n", "data = encoded_dataset[\"train\"][0:batch_size]\n", "input_torch: OD[str, torch.Tensor] = convert_tensor(data=data, output=\"torch\")\n", "\n", - "for _ in range(30):\n", - " _ = baseline_model(**input_torch)\n", - " torch.cuda.synchronize()\n", - "time_buffer = list()\n", - "for _ in range(100):\n", - " with track_infer_time(time_buffer):\n", + "with torch.inference_mode():\n", + " for _ in range(30):\n", " _ = baseline_model(**input_torch)\n", " torch.cuda.synchronize()\n", + " time_buffer = list()\n", + " for _ in range(100):\n", + " with track_infer_time(time_buffer):\n", + " _ = baseline_model(**input_torch)\n", + " torch.cuda.synchronize()\n", "print_timings(name=\"Pytorch (FP32)\", timings=time_buffer)" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[Pytorch (FP16)] mean=56.04ms, sd=0.56ms, min=55.44ms, max=57.93ms, median=55.88ms, 95p=57.36ms, 99p=57.62ms\n" + "[Pytorch (FP16)] mean=58.78ms, sd=1.59ms, min=57.74ms, max=64.04ms, median=58.15ms, 95p=62.80ms, 
99p=63.88ms\n" ] } ], "source": [ "from torch.cuda.amp import autocast\n", "\n", - "with autocast():\n", - " for _ in range(30):\n", - " _ = baseline_model(**input_torch)\n", + "with torch.inference_mode():\n", + " with autocast():\n", + " for _ in range(30):\n", + " _ = baseline_model(**input_torch)\n", + " torch.cuda.synchronize()\n", + " time_buffer = []\n", + " for _ in range(100):\n", + " with track_infer_time(time_buffer):\n", + " _ = baseline_model(**input_torch)\n", + " torch.cuda.synchronize()\n", + "print_timings(name=\"Pytorch (FP16)\", timings=time_buffer)\n", + "del baseline_model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### CPU execution" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Pytorch (FP32) - CPU] mean=4406.68ms, sd=290.44ms, min=3908.02ms, max=4794.74ms, median=4486.10ms, 95p=4725.07ms, 99p=4780.80ms\n" + ] + } + ], + "source": [ + "baseline_model = AutoModelForSequenceClassification.from_pretrained(\"roberta-baseline\", num_labels=num_labels)\n", + "baseline_model = baseline_model.eval()\n", + "input_torch_cpu = {k: v.to(\"cpu\") for k, v in input_torch.items()}\n", + "\n", + "\n", + "with torch.inference_mode():\n", + " for _ in range(3):\n", + " _ = baseline_model(**input_torch_cpu)\n", " torch.cuda.synchronize()\n", - " time_buffer = []\n", - " for _ in range(100):\n", + " time_buffer = list()\n", + " for _ in range(10):\n", " with track_infer_time(time_buffer):\n", - " _ = baseline_model(**input_torch)\n", + " _ = baseline_model(**input_torch_cpu)\n", + " torch.cuda.synchronize()\n", + "print_timings(name=\"Pytorch (FP32) - CPU\", timings=time_buffer)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Pytorch (FP16) - CPU] mean=4255.15ms, sd=123.93ms, min=4103.51ms, max=4527.69ms, median=4206.06ms, 95p=4469.24ms, 99p=4516.00ms\n" + ] + } + ], + "source": [ + "with torch.inference_mode():\n", + " with autocast():\n", + " for _ in range(3):\n", + " _ = baseline_model(**input_torch_cpu)\n", " torch.cuda.synchronize()\n", - "print_timings(name=\"Pytorch (FP16)\", timings=time_buffer)" + " time_buffer = []\n", + " for _ in range(10):\n", + " with track_infer_time(time_buffer):\n", + " _ = baseline_model(**input_torch_cpu)\n", + " torch.cuda.synchronize()\n", + "print_timings(name=\"Pytorch (FP16) - CPU\", timings=time_buffer)\n", + "del baseline_model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### TensorRT baseline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below we export a randomly initialized `Roberta` model, the purpose is to only check the performance on mixed precision (FP16, no quantization)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "baseline_model = AutoModelForSequenceClassification.from_pretrained(\"roberta-baseline\", num_labels=num_labels)\n", + "baseline_model = baseline_model.cuda()\n", + "convert_to_onnx(baseline_model, output_path=\"baseline.onnx\", inputs_pytorch=input_torch, opset=12)\n", + "del baseline_model" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[TensorRT (FP16)] mean=30.23ms, sd=0.25ms, min=29.92ms, max=31.51ms, median=30.14ms, 95p=30.74ms, 99p=30.95ms\n" + ] + } + ], + "source": [ + "engine = build_engine(\n", + " runtime=runtime,\n", + " onnx_file_path=\"baseline.onnx\",\n", + " logger=trt_logger,\n", + " min_shape=(batch_size, max_seq_len),\n", + " optimal_shape=(batch_size, max_seq_len),\n", + " max_shape=(batch_size, max_seq_len),\n", + " workspace_size=10000 * 1024 * 1024,\n", + " fp16=True,\n", + " int8=False,\n", + ")\n", + "stream: Stream = pycuda.driver.Stream()\n", + "context: IExecutionContext = engine.create_execution_context()\n", + "context.set_optimization_profile_async(profile_index=profile_index, stream_handle=stream.handle)\n", + "input_binding_idxs, output_binding_idxs = get_binding_idxs(engine, profile_index) # type: List[int], List[int]\n", + "for _ in range(30):\n", + " _ = infer_tensorrt(\n", + " context=context,\n", + " host_inputs=input_np,\n", + " input_binding_idxs=input_binding_idxs,\n", + " output_binding_idxs=output_binding_idxs,\n", + " stream=stream,\n", + " )\n", + "time_buffer = list()\n", + "for _ in range(100):\n", + " with track_infer_time(time_buffer):\n", + " _ = infer_tensorrt(\n", + " context=context,\n", + " host_inputs=input_np,\n", + " input_binding_idxs=input_binding_idxs,\n", + " output_binding_idxs=output_binding_idxs,\n", + " stream=stream,\n", + " )\n", + "\n", + "print_timings(name=\"TensorRT (FP16)\", timings=time_buffer)\n", + "del engine, context" ] }, { diff --git a/src/transformer_deploy/backends/ort_utils.py b/src/transformer_deploy/backends/ort_utils.py index 31c59e69..1f83f320 100644 --- a/src/transformer_deploy/backends/ort_utils.py +++ b/src/transformer_deploy/backends/ort_utils.py @@ -36,7 +36,9 @@ def create_model_for_provider(path: str, provider_to_use: str) -> InferenceSessi return InferenceSession(path, options, providers=provider_to_use) -def convert_to_onnx(model_pytorch: PreTrainedModel, output_path: str, inputs_pytorch: OD[str, torch.Tensor]) -> None: +def convert_to_onnx( + model_pytorch: PreTrainedModel, output_path: str, inputs_pytorch: OD[str, torch.Tensor], opset: int = 12 +) -> None: # dynamic axis == variable length axis dynamic_axis = OrderedDict() for k in inputs_pytorch.keys(): @@ -47,7 +49,7 @@ def convert_to_onnx(model_pytorch: PreTrainedModel, output_path: str, inputs_pyt model_pytorch, # model to optimize args=tuple(inputs_pytorch.values()), # tuple of multiple inputs f=output_path, # output path / file object - opset_version=13, # the ONNX version to use + opset_version=opset, # the ONNX version to use, 13 if quantized model, 12 for not quantized ones do_constant_folding=True, # simplify model (replace constant expressions) input_names=list(inputs_pytorch.keys()), # input names output_names=["output"], # output axis name @@ -65,7 +67,7 @@ def optimize_onnx(onnx_path: str, onnx_optim_fp16_path: str, use_cuda: bool) -> model_type="bert", 
use_gpu=use_cuda, opt_level=1, - num_heads=0, # automatic detection + num_heads=0, # automatic detection don't work with opset 13 hidden_size=0, # automatic detection optimization_options=optimization_options, ) diff --git a/src/transformer_deploy/convert.py b/src/transformer_deploy/convert.py index e65c4057..cfc4cfc5 100644 --- a/src/transformer_deploy/convert.py +++ b/src/transformer_deploy/convert.py @@ -76,7 +76,7 @@ def main(): default=["onnx"], help="backend to use. One of [onnx,tensorrt, pytorch] or all", nargs="*", - choices=["onnx", "tensorrt", "pytorch"], + choices=["onnx", "tensorrt"], ) parser.add_argument("--nb-instances", default=1, help="# of model instances, may improve troughput", type=int) parser.add_argument("--warmup", default=100, help="# of inferences to warm each model", type=int) @@ -118,9 +118,14 @@ def main(): logging.info(f"[Pytorch] input shape {inputs_pytorch['input_ids'].shape}") logging.info(f"[Pytorch] output shape: {output_pytorch.shape}") # create onnx model and compare results + opset = 12 if args.quantization: TensorQuantizer.use_fb_fake_quant = True - convert_to_onnx(model_pytorch=model_pytorch, output_path=onnx_model_path, inputs_pytorch=inputs_pytorch) + opset = 13 + + convert_to_onnx( + model_pytorch=model_pytorch, output_path=onnx_model_path, inputs_pytorch=inputs_pytorch, opset=opset + ) if args.quantization: TensorQuantizer.use_fb_fake_quant = False onnx_model = create_model_for_provider(path=onnx_model_path, provider_to_use="CUDAExecutionProvider") From fc29af8e9b22bbf42ded9d52152c4e1eb6f34c5b Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Wed, 8 Dec 2021 14:21:51 +0100 Subject: [PATCH 08/15] bump VERSION --- README.md | 1 + VERSION | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c2dd4352..2c130775 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ * [🐍 TensorRT usage in Python script](#tensorrt-usage-in-python-script) * [⏱ benchmarks](#benchmarks) * [🤗 end to end reproduction of Infinity Hugging Face demo](./demo/README.md) (to replay [Medium article](https://towardsdatascience.com/hugging-face-transformer-inference-under-1-millisecond-latency-e1be0057a51c?source=friends_link&sk=cd880e05c501c7880f2b9454830b8915)) +* [🏎️ end to end GPU quantization tutorial](./demo/quantization_end_to_end.ipynb) #### Why this tool? 
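For readers who want to reproduce the quantized export path wired into `convert.py` above outside of the CLI, the snippet below is a minimal sketch rather than the exact notebook cell: it assumes the `roberta-trained-quantized` checkpoint saved earlier in the tutorial exists locally, and that any representative batch is acceptable as tracing input.

```python
from pytorch_quantization.nn import TensorQuantizer
from transformers import AutoTokenizer

from transformer_deploy.backends.ort_utils import convert_to_onnx
from transformer_deploy.QDQModels.QDQRoberta import QDQRobertaForSequenceClassification

# checkpoint produced earlier in the tutorial (assumption: it exists locally)
model = QDQRobertaForSequenceClassification.from_pretrained("roberta-trained-quantized", num_labels=3)
model = model.cuda().eval()

tokenizer = AutoTokenizer.from_pretrained("roberta-base", use_fast=True)
encoded = tokenizer(
    ["a first premise", "a second premise"],
    ["a first hypothesis", "a second hypothesis"],
    truncation=True,
    padding="max_length",
    max_length=256,
    return_tensors="pt",
)
inputs_pytorch = {k: v.cuda() for k, v in encoded.items()}

# QDQ models need real ONNX Q/DQ nodes (opset >= 13) instead of Pytorch fake-quant ops
TensorQuantizer.use_fb_fake_quant = True
convert_to_onnx(model_pytorch=model, output_path="roberta_q.onnx", inputs_pytorch=inputs_pytorch, opset=13)
TensorQuantizer.use_fb_fake_quant = False
```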
diff --git a/VERSION b/VERSION index 17e51c38..0ea3a944 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.1 +0.2.0 From 5630c1ec7e0b8ba80dc42b5535ee15b72a3676bb Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Wed, 8 Dec 2021 14:31:22 +0100 Subject: [PATCH 09/15] delete old script --- roberta_classic.py | 208 ------------------ .../QDQModels/QDQRoberta.py | 8 +- 2 files changed, 4 insertions(+), 212 deletions(-) delete mode 100644 roberta_classic.py diff --git a/roberta_classic.py b/roberta_classic.py deleted file mode 100644 index 2393d6eb..00000000 --- a/roberta_classic.py +++ /dev/null @@ -1,208 +0,0 @@ -import logging - -import numpy as np -import pytorch_quantization.nn as quant_nn -import torch -from datasets import load_dataset, load_metric -from pytorch_quantization import calib -from pytorch_quantization.tensor_quant import QuantDescriptor -from tqdm import tqdm -from transformers import ( - AutoModelForSequenceClassification, - AutoTokenizer, - IntervalStrategy, - PreTrainedModel, - Trainer, - TrainingArguments, -) - -from transformer_deploy.backends.ort_utils import convert_to_onnx -from transformer_deploy.QDQModels.QDQRoberta import QDQRobertaForSequenceClassification - - -logging.getLogger().setLevel(logging.WARNING) - -num_labels = 3 -model_checkpoint = "roberta-base" -batch_size = 32 -validation_key = "validation_matched" -dataset = load_dataset("glue", "mnli") -metric = load_metric("glue", "mnli") -nb_step = 1000 -training_strategy = IntervalStrategy.STEPS - -tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True) - - -def preprocess_function(examples): - return tokenizer(examples["premise"], examples["hypothesis"], truncation=True, padding="max_length", max_length=256) - - -def compute_metrics(eval_pred): - predictions, labels = eval_pred - predictions = np.argmax(predictions, axis=1) - return metric.compute(predictions=predictions, references=labels) - - -def fuse_qkv(model, quant_per_tensor: bool): - """Adjust quantization ranges to match an implementation where the QKV projections are implemented with a single GEMM. - Force the weight and output scale factors to match by taking the max of (Q,K,V). 
- """ - - def fuse3(qq, qk, qv): - for mod in [qq, qk, qv]: - if not hasattr(mod, "_amax"): - print(" WARNING: NO AMAX BUFFER") - return - q = qq._amax.detach().item() - k = qk._amax.detach().item() - v = qv._amax.detach().item() - - amax = max(q, k, v) - qq._amax.fill_(amax) - qk._amax.fill_(amax) - qv._amax.fill_(amax) - print(f" q={q:5.2f} k={k:5.2f} v={v:5.2f} -> {amax:5.2f}") - - for name, mod in model.named_modules(): - if name.endswith(".attention.self"): - print(f"FUSE_QKV: {name}") - fuse3(mod.matmul_q_input_quantizer, mod.matmul_k_input_quantizer, mod.matmul_v_input_quantizer) - if quant_per_tensor: - fuse3(mod.query._weight_quantizer, mod.key._weight_quantizer, mod.value._weight_quantizer) - - -encoded_dataset = dataset.map(preprocess_function, batched=True) - -args = TrainingArguments( - f"{model_checkpoint}-finetuned", - evaluation_strategy=training_strategy, - eval_steps=nb_step, - logging_steps=nb_step, - save_steps=nb_step, - save_strategy=training_strategy, - learning_rate=1e-5, # 7.5e-6 https://github.com/pytorch/fairseq/issues/2057#issuecomment-643674771 - per_device_train_batch_size=batch_size, - per_device_eval_batch_size=batch_size * 2, - num_train_epochs=1, - fp16=True, - group_by_length=False, - weight_decay=0.01, - load_best_model_at_end=True, - metric_for_best_model="accuracy", -) - -model_roberta: PreTrainedModel = AutoModelForSequenceClassification.from_pretrained( - model_checkpoint, num_labels=num_labels -) -model_roberta = model_roberta.cuda() - -trainer = Trainer( - model_roberta, - args, - train_dataset=encoded_dataset["train"], - eval_dataset=encoded_dataset[validation_key], - tokenizer=tokenizer, - compute_metrics=compute_metrics, -) -print(trainer.evaluate()) -# {'eval_loss': 0.3559744358062744, 'eval_accuracy': 0.8655119714722364, 'eval_runtime': 19.6678, 'eval_samples_per_second': 499.04, 'eval_steps_per_second': 7.83, 'epoch': 0.98} -trainer.train() -trainer.save_model("roberta-model") -del model_roberta -del trainer - -input_desc = QuantDescriptor(num_bits=8, calib_method="histogram") -# below we do per-channel quantization for weights, set axis to None to get a per tensor calibration -weight_desc = QuantDescriptor(num_bits=8, axis=(0,)) -quant_nn.QuantLinear.set_default_quant_desc_input(input_desc) -quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc) - -# keep it on CPU -model_roberta_q: PreTrainedModel = QDQRobertaForSequenceClassification.from_pretrained("roberta-model") - -# Find the TensorQuantizer and enable calibration -for name, module in tqdm(model_roberta_q.named_modules()): - if isinstance(module, quant_nn.TensorQuantizer): - if module._calibrator is not None: - module.disable_quant() - module.enable_calib() - else: - module.disable() - -with torch.no_grad(): - for start_index in tqdm(range(0, 128, batch_size)): - end_index = start_index + batch_size - data = encoded_dataset["train"][start_index:end_index] - input_torch = { - k: torch.tensor(list(v), dtype=torch.long, device="cpu") - for k, v in data.items() - if k in ["input_ids", "attention_mask", "token_type_ids"] - } - model_roberta_q(**input_torch) - - -# Finalize calibration -for name, module in model_roberta_q.named_modules(): - if isinstance(module, quant_nn.TensorQuantizer): - if module._calibrator is not None: - if isinstance(module._calibrator, calib.MaxCalibrator): - module.load_calib_amax() - else: - module.load_calib_amax("percentile", percentile=99.99) - module.enable_quant() - module.disable_calib() - else: - module.enable() - -model_roberta_q.cuda() - 
-model_roberta_q.save_pretrained("roberta-trained-quantized") -del model_roberta_q - - -model_roberta_q: PreTrainedModel = QDQRobertaForSequenceClassification.from_pretrained( - "roberta-trained-quantized", num_labels=num_labels -) -model_roberta_q = model_roberta_q.cuda() - -args.learning_rate /= 10 -print(f"LR: {args.learning_rate}") -trainer = Trainer( - model_roberta_q, - args, - train_dataset=encoded_dataset["train"], - eval_dataset=encoded_dataset[validation_key], - tokenizer=tokenizer, - compute_metrics=compute_metrics, -) -print(trainer.evaluate()) -# 4 batches -# {'eval_loss': 0.38076257705688477, 'eval_accuracy': 0.8552215995924605, 'eval_runtime': 46.9577, 'eval_samples_per_second': 209.018, 'eval_steps_per_second': 3.28} -# 100 batches -# {'eval_loss': 0.386756956577301, 'eval_accuracy': 0.8516556291390729, 'eval_runtime': 48.9996, 'eval_samples_per_second': 200.308, 'eval_steps_per_second': 3.143} -trainer.train() -print(trainer.evaluate()) -# {'eval_loss': 0.40235549211502075, 'eval_accuracy': 0.8589913397860418, 'eval_runtime': 46.1754, 'eval_samples_per_second': 212.559, 'eval_steps_per_second': 3.335, 'epoch': 1.0} -model_roberta_q.save_pretrained("roberta-in-bert-trained-quantized-retrained") - - -# fuse_qkv(model_roberta_q, quant_per_tensor=True) -data = encoded_dataset["train"][1:3] -input_torch = { - k: torch.tensor(list(v), dtype=torch.long, device="cuda") - for k, v in data.items() - if k in ["input_ids", "attention_mask", "token_type_ids"] -} - -from pytorch_quantization.nn import TensorQuantizer - - -TensorQuantizer.use_fb_fake_quant = True -convert_to_onnx(model_pytorch=model_roberta_q, output_path="roberta_q.onnx", inputs_pytorch=input_torch) -TensorQuantizer.use_fb_fake_quant = False -# /usr/src/tensorrt/bin/trtexec --onnx=roberta_q.onnx --shapes=input_ids:1x384,attention_mask:1x384 --best --workspace=6000 -# no fusing -# Latency: min = 1.85529 ms, max = 4.32666 ms, mean = 1.98449 ms, median = 1.87964 ms, percentile(99%) = 3.19434 ms -# with fusing -# Latency: min = 1.84412 ms, max = 2.22266 ms, mean = 1.87675 ms, median = 1.8717 ms, percentile(99%) = 2.07849 ms diff --git a/src/transformer_deploy/QDQModels/QDQRoberta.py b/src/transformer_deploy/QDQModels/QDQRoberta.py index a3db43ec..4d9ddc77 100644 --- a/src/transformer_deploy/QDQModels/QDQRoberta.py +++ b/src/transformer_deploy/QDQModels/QDQRoberta.py @@ -135,7 +135,7 @@ def forward( seq_length = input_shape[1] # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs - # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves # noqa: E501 # issue #5664 if token_type_ids is None: if hasattr(self, "token_type_ids"): @@ -477,7 +477,7 @@ def forward( if self.is_decoder and encoder_hidden_states is not None: if not hasattr(self, "crossattention"): raise ValueError( - f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" # noqa: E501 ) # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple @@ -831,7 +831,7 @@ def forward( use_cache (:obj:`bool`, `optional`): If set to :obj:`True`, :obj:`past_key_values` key value states 
are returned and can be used to speed up decoding (see :obj:`past_key_values`). - """ + """ # noqa: E501 output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1015,7 +1015,7 @@ def forward( >>> outputs = model(**inputs) >>> prediction_logits = outputs.logits - """ + """ # noqa: E501 return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False From a8fa397a3c781cd37335166ca1e783a32408ca87 Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Wed, 8 Dec 2021 16:15:48 +0100 Subject: [PATCH 10/15] cleaning --- demo/quantization_end_to_end.ipynb | 16 ++++++++-------- src/transformer_deploy/backends/trt_utils.py | 2 +- src/transformer_deploy/convert.py | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/demo/quantization_end_to_end.ipynb b/demo/quantization_end_to_end.ipynb index 9793151d..9d407e30 100644 --- a/demo/quantization_end_to_end.ipynb +++ b/demo/quantization_end_to_end.ipynb @@ -25,13 +25,13 @@ "\n", "| Framework | Precision | Latency (ms) | Accuracy | Speedup | Hardware |\n", "|:---------------------------|-----------|--------------|----------|:----------|:--------:|\n", - "| Pytorch | FP32 | 4407 | 86.8 % | X 0.02 | CPU |\n", - "| Pytorch | FP16 | 4255 | 86.8 % | X 0.02 | CPU |\n", - "| Pytorch | FP32 | 77 | 86.8 % | X 1 | GPU |\n", - "| Pytorch | FP16 | 58 | 86.8 % | X 1.3 | GPU |\n", - "| TensorRT | FP16 | 30 | 86.8 % | X 2.6 | GPU |\n", - "| TensorRT (transplantation) | **INT-8** | 15 | 84.8 % | **X 5.1** | GPU |\n", - "| TensorRT (custom QDQ code) | **INT-8** | 15 | 85.6 % | **X 5.1** | GPU |\n", + "| Pytorch | FP32 | 4407 | 86.8 % | X 0.02 | CPU |\n", + "| Pytorch | FP16 | 4255 | 86.8 % | X 0.02 | CPU |\n", + "| Pytorch | FP32 | 77 | 86.8 % | X 1 | GPU |\n", + "| Pytorch | FP16 | 58 | 86.8 % | X 1.3 | GPU |\n", + "| TensorRT | FP16 | 30 | 86.8 % | X 2.6 | GPU |\n", + "| TensorRT (transplantation) | **INT-8** | 15 | 84.8 % | **X 5.1** | GPU |\n", + "| TensorRT (custom QDQ code) | **INT-8** | 15 | 85.6 % | **X 5.1** | GPU |\n", "\n", "> measures done on a Nvidia RTX 3090 GPU + 12 cores i7 Intel CPU\n", "> accuracy obtained after a single epoch, no LR search or any hyper parameter optimization\n", @@ -43,7 +43,7 @@ "\n", "Basic idea behind model quantization is to replace tensors made of float numbers (usually encoded on 32 bits) by lower precision representation (encoded on 8 bits for Nvidia GPUs), in general integers.\n", "Therefore computation is faster and model memory footprint is lower. Making tensor storage smaller makes memory transfer faster... 
and is also a computation acceleration factor.\n", - "This technic is very interesting for its trade-off: you reduce inference time significantly, and in most scenarios it cost close to nothing in accuracy.\n", + "This technic is very interesting for its trade-off: you reduce inference time significantly, and in most scenarios it costs close to nothing in accuracy.\n", "\n", "Replacing float numbers by integers is done through a mapping.\n", "This step is called `calibration`, and its purpose is to compute for each tensor or each channel of a tensor (one of its dimensions) a range of all possible values and then define a scale and a distribution center to map float numbers to 8 bits integers.\n", diff --git a/src/transformer_deploy/backends/trt_utils.py b/src/transformer_deploy/backends/trt_utils.py index c6ed7058..6cd294b1 100644 --- a/src/transformer_deploy/backends/trt_utils.py +++ b/src/transformer_deploy/backends/trt_utils.py @@ -209,7 +209,7 @@ def infer_tensorrt( # calculate input shape, bind it, allocate GPU memory for the output host_outputs, device_outputs = setup_binding_shapes(context, input_list, input_binding_idxs, output_binding_idxs) bindings = device_inputs + device_outputs - assert context.execute_async_v2(bindings, stream_handle=stream.handle), "failure during execute inference call" + assert context.execute_async_v2(bindings, stream_handle=stream.handle), "failure during execution of inference" for h_output, d_output in zip(host_outputs, device_outputs): cuda.memcpy_dtoh_async(h_output, d_output) # GPU to host stream.synchronize() # sync all CUDA ops diff --git a/src/transformer_deploy/convert.py b/src/transformer_deploy/convert.py index cfc4cfc5..9b94c9de 100644 --- a/src/transformer_deploy/convert.py +++ b/src/transformer_deploy/convert.py @@ -48,8 +48,8 @@ def main(): parser = argparse.ArgumentParser( description="optimize and deploy transformers", formatter_class=argparse.ArgumentDefaultsHelpFormatter ) - parser.add_argument("-m", "--model", required=True, help="path to model or URL to Hugging Face Hub") - parser.add_argument("-t", "--tokenizer", help="path to tokenizer or URL to Hugging Face Hub") + parser.add_argument("-m", "--model", required=True, help="path to model or URL to Hugging Face hub") + parser.add_argument("-t", "--tokenizer", help="path to tokenizer or URL to Hugging Face hub") parser.add_argument( "-b", "--batch-size", From dc175d87c17cf10e7e88b4e117c2d0a4141db0b0 Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Wed, 8 Dec 2021 17:53:50 +0100 Subject: [PATCH 11/15] fix ORT to 1.9.0, 1.10.0 seems to be bugged --- requirements_gpu.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements_gpu.txt b/requirements_gpu.txt index 65017090..2fad68e0 100644 --- a/requirements_gpu.txt +++ b/requirements_gpu.txt @@ -1,10 +1,9 @@ onnx -onnxruntime-gpu +onnxruntime-gpu==1.9.0 nvidia-pyindex tritonclient[all] pycuda torch==1.10.0+cu113 -nvidia-pyindex nvidia-tensorrt onnx_graphsurgeon polygraphy From 8cf0e0dc07505b1d4fe0f8b3bf43ac7371517250 Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Wed, 8 Dec 2021 22:59:53 +0100 Subject: [PATCH 12/15] modify text --- README.md | 9 ++++ demo/quantization_end_to_end.ipynb | 70 +++++++++++++----------------- 2 files changed, 38 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index 2c130775..8fe6666d 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,15 @@ With the single command below, you will: ```shell convert_model -m roberta-large-mnli --backend tensorrt onnx 
--seq-len 16 128 128 --batch-size 1 32 32 +# ... +# Inference done on NVIDIA GeForce RTX 3090 +# latencies: +# [Pytorch (FP32)] mean=123.26ms, sd=3.35ms, min=117.84ms, max=136.12ms, median=122.09ms, 95p=129.50ms, 99p=131.24ms +# [Pytorch (FP16)] mean=78.41ms, sd=2.83ms, min=75.58ms, max=88.48ms, median=77.28ms, 95p=84.66ms, 99p=85.97ms +# [TensorRT (FP16)] mean=182.99ms, sd=3.15ms, min=175.75ms, max=191.58ms, median=182.32ms, 95p=188.37ms, 99p=190.80ms +# [ONNX Runtime (vanilla)] mean=119.03ms, sd=8.27ms, min=112.15ms, max=185.57ms, median=116.51ms, 95p=129.18ms, 99p=167.70ms +# [ONNX Runtime (optimized)] mean=53.82ms, sd=0.81ms, min=52.79ms, max=58.27ms, median=53.74ms, 95p=55.38ms, 99p=57.29ms + ``` > **16 128 128** -> minimum, optimal, maximum sequence length, to help TensorRT better optimize your model diff --git a/demo/quantization_end_to_end.ipynb b/demo/quantization_end_to_end.ipynb index 9d407e30..171a88fe 100644 --- a/demo/quantization_end_to_end.ipynb +++ b/demo/quantization_end_to_end.ipynb @@ -17,9 +17,9 @@ "* it takes less memory\n", "* computation is easier / faster\n", "\n", - "It can be applied to any model in theory, and, if done well, it should not decrease model accuracy.\n", + "It can be applied to any model in theory, and, if done well, it should not decrease its accuracy.\n", "\n", - "The purpose of this tutorial is to show 2 processes to perform quantization on most `transformer` architecture.\n", + "The purpose of this tutorial is to show 2 processes to perform quantization on most `transformer` architectures.\n", "\n", "**TL;DR, inference is 5 times faster on a `Roberta-base` model** with a batch of size 32 / seq len 256, benchmark on MNLI datasets (bold -> **quantization**):\n", "\n", @@ -41,9 +41,9 @@ "\n", "## A (very) short intro to INT-8 quantization\n", "\n", - "Basic idea behind model quantization is to replace tensors made of float numbers (usually encoded on 32 bits) by lower precision representation (encoded on 8 bits for Nvidia GPUs), in general integers.\n", - "Therefore computation is faster and model memory footprint is lower. Making tensor storage smaller makes memory transfer faster... and is also a computation acceleration factor.\n", - "This technic is very interesting for its trade-off: you reduce inference time significantly, and in most scenarios it costs close to nothing in accuracy.\n", + "Basic idea behind model quantization is to replace tensors made of float numbers (usually encoded on 32 bits) by lower precision representation (integers encoded on 8 bits for Nvidia GPUs).\n", + "Therefore computation is faster and model memory footprint is lower. Making tensor storage smaller makes memory transfer faster... 
and is also a source of computation acceleration.\n", + "This technique is very interesting for its trade-off: you reduce inference time significantly, and when the dataset is large enough, it costs close to nothing in accuracy.\n", "\n", "Replacing float numbers by integers is done through a mapping.\n", "This step is called `calibration`, and its purpose is to compute for each tensor or each channel of a tensor (one of its dimensions) a range of all possible values and then define a scale and a distribution center to map float numbers to 8-bit integers.\n", @@ -55,28 +55,32 @@ "* statically, after training (`post training quantization` or `PTQ`): this way is efficient, but it may have a significant accuracy cost,\n", "* statically, before training (`quantization aware training` or `QAT`): this way is efficient and has a low accuracy cost as the weights will take care of the result\n", "\n", - "In this guide we will focus on the third option: QAT.\n", + "In this guide we will focus on the third option: `QAT`.\n", "\n", - "During a quantization aware training:\n", + "During the quantization aware *training*:\n", "\n", - "* in the inside, Pytorch will work with high precision float numbers,\n", + "* on the inside, Pytorch will train with high precision float numbers,\n", "* on the outside, Pytorch will simulate that quantization has already been applied and output results accordingly (for loss computation, for instance)\n", "* it will also refine the quantization mapping (scale, range, distribution center, etc.)\n", "\n", + "The simulation is done through the addition of quantization / dequantization nodes, most often called `QDQ`, an abbreviation you will see often in the quantization world.\n", + "\n", "You can check this [high quality blog post](https://leimao.github.io/article/Neural-Networks-Quantization/) for more information.\n", "\n", "## Why a dedicated tutorial?\n", "\n", - "CPU quantization is supported out of the box by `Pytorch` or `ONNX Runtime`.\n", + "CPU quantization is supported out of the box by `Pytorch` and `ONNX Runtime`.\n", "**GPU quantization, on the other hand, requires specific tools and processes to be applied**.\n", "\n", - "In the specific case of `transformer` models, right now (december 2021), the only way shown by Nvidia is to build manually the graph of our models in `TensorRT`. This is a low level approach, based on GPU capacity knowledge (which operators are supported, etc.). It's certainly out of reach of most NLP practitioners and is very time consuming to update/adapt to new architectures.\n", + "In the specific case of `transformer` models, until recently (December 2021), the only way shown by Nvidia was to manually build the graph of our models in `TensorRT`. This is a low level approach, based on knowledge of GPU capabilities (which operators are supported, etc.). 
It's certainly out of reach of most NLP practitioners and is very time consuming to update/adapt to new architectures.\n", "\n", - "Hopefully, Nvidia recently added to Hugging Face `transformer` library a new model called `QDQBert`.\n", + "Fortunately, Nvidia added a new model called `QDQBert` to the Hugging Face `transformers` library a few weeks ago.\n", "Basically, it's a vanilla `Bert` architecture which supports INT-8 quantization.\n", "It doesn't support any other architecture out of the box, like `Albert`, `Roberta`, or `Electra`.\n", "Nvidia also provides a demo dedicated to the SQuAD task.\n", "\n", + "This opens the door to extending the approach to other architectures.\n", + "\n", "To be both simple and cover most use cases, in this tutorial we will see:\n", "\n", "* how to perform GPU quantization on **any** transformer model (not just Bert) using a simple trick, a `transplantation`\n", @@ -98,7 +102,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We install `master` branch of `transfomers` library to use a new model: **QDQBert** and `transformer-deploy` to leverage `TensorRT` models (TensorRT API is not something simple to master, it's highly advised to use a wrapper)." + "We install the `master` branch of the `transformers` library to use a new model, **QDQBert**, and `transformer-deploy` to leverage `TensorRT` models (the TensorRT API is not simple to master, so it's highly advised to use a wrapper). Your machine should have Nvidia CUDA 11.X, TensorRT 8.2.1 and cuBLAS installed. They are said to be tricky to install; in my experience, just follow the Nvidia instructions **and nothing else** and it should work out of the box. A Docker image with TensorRT 8.2.1 has not yet been released; this tutorial will be updated when it is ready." ] }, { @@ -380,14 +384,7 @@ "id": "YVx71GdAIrJH" }, "source": [ - "Before we can feed those texts to our model, we need to preprocess them. This is done by a 🤗 Transformers `Tokenizer` which will (as the name indicates) tokenize the inputs (including converting the tokens to their corresponding IDs in the pretrained vocabulary) and put it in a format the model expects, as well as generate the other inputs that model requires.\n", - "\n", - "To do all of this, we instantiate our tokenizer with the `AutoTokenizer.from_pretrained` method, which will ensure:\n", - "\n", - "- we get a tokenizer that corresponds to the model architecture we want to use,\n", - "- we download the vocabulary used when pretraining this specific checkpoint.\n", - "\n", - "That vocabulary will be cached, so it's not downloaded again the next time we run the cell." + "Before we can feed those texts to our model, we need to preprocess them. This is done by a 🤗 Transformers `Tokenizer` which will (as the name indicates) tokenize the inputs (including converting the tokens to their corresponding IDs in the pretrained vocabulary) and put them in a format the model expects, as well as generate the other inputs the model requires." 
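To make the preprocessing step above concrete, here is a short sketch of how the MNLI inputs can be built; the premise / hypothesis field names, the fast tokenizer flag and the fixed 256-token padding follow the standalone script removed earlier in this patch series, so treat it as indicative rather than as the exact notebook cell.

```python
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base", use_fast=True)
dataset = load_dataset("glue", "mnli")


def preprocess_function(examples):
    # MNLI provides premise / hypothesis pairs; fixed-length padding keeps tensor shapes
    # static, which is convenient for the TensorRT profiles used later in the tutorial
    return tokenizer(
        examples["premise"], examples["hypothesis"], truncation=True, padding="max_length", max_length=256
    )


encoded_dataset = dataset.map(preprocess_function, batched=True)
```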
] }, { @@ -679,7 +676,7 @@ "Better models appeared, and most of the work has been done to improve the pretraining step (aka the weights).\n", "So the idea will be to take the weights from those new models and put them inside `Bert` architecture.\n", "\n", - "The process described below should work for most users.\n", + "The process described below should work for most architectures.\n", "\n", "**steps**:\n", "\n", @@ -689,7 +686,7 @@ "* replace weight/layer names with those from `Roberta`\n", "* override the architecture name in model configuration\n", "\n", - "If there is no 1 to 1 correspondance (it happens), try to keep at least embeddings and self attention. Of course, it's possible that if a model is very different, the transplant may cost some accuracy. In our experience, if your trainset is big enough it should not happen.\n" + "If there is no 1 to 1 correspondance (it happens), try to keep at least token embeddings and self attention. Of course, it's possible that if a model is very different, the transplant may cost some accuracy. In our experience, if your trainset is big enough it should not happen.\n" ] }, { @@ -757,15 +754,8 @@ "source": [ "## Model training\n", "\n", - "\n", - "When you create a classification model from a pretrained one, the last layer are randomly initialized.\n", - "We don't want to take these totally random values to compute the calibration of tensors.\n", - "Moreover, our trainset is a bit small, and it's easy to overfit.\n", - "\n", - "Therefore, we train our `Roberta into Bert` model on 1/6 of the train set.\n", - "The goal is to slightly update the weights to the new architecture, not to get the best score.\n", - "\n", - "> another approach is to fully train your model, perform calibration, and then retrain it on a small part of the data with a low learning rate (usually 1/10 of the original one).\n" + "The goal is to update weights to the new architecture, not to get the best score.\n", + "For instance, position embeddings are not managed the same way on Bert and Roberta. We need to relearn those parts." ] }, { @@ -839,7 +829,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Quantization" + "## Quantization" ] }, { @@ -859,19 +849,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Calibration" + "### Calibration" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Activate histogram calibration\n", + "#### Activate histogram calibration\n", "\n", "There are several kinds of calbrators, below we use the percentile one (99.99p) (`histogram`), basically, its purpose is to just remove the most extreme values before computing range / scale.\n", "The other option is `max`, it's much faster but expect lower accuracy.\n", "\n", - "Second calibration option, choose between calibration done at the tensor level or per channel (more fine grained, slower)." + "Second calibration option, choose between calibration done at the tensor level or per channel (fine grained, slower)." ] }, { @@ -892,13 +882,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Perform calibration\n", + "#### Perform calibration\n", "\n", "During this step we will enable the calibration nodes, and pass some representative data to the model.\n", "It will then be used to compute the scale/range.\n", "\n", "Official recommendations from Nvidia is to calibrate over thousands of examples from the validation set.\n", - "Here we use 40*32 examples, because it's a slow process. 
It's enough to be close from the original accuracy, on your use case, follow Nvidia process." + "Here we use 128 examples because it's a slow process. It's enough to be close from the original accuracy." ] }, { @@ -1279,12 +1269,10 @@ "\n", "The process below is a bit simpler than the method 1:\n", "\n", - "\n", - "* calibrate\n", + "* Calibrate\n", "* Quantization Aware training (QAT)\n", "\n", - "\n", - "### Fine tuning the model" + "> there are many ways to get a QDQ model, you can modify Pytorch source code like here, patch ONNX graph (this approach is used at Microsoft for instance) or leverage the new FX Pytorch interface. Modifying the source code is the most straight forward so we choosed to do it that way.\n" ] }, { From 200b4d3dfaa217119babc5b33efcbaa699f75f8a Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Wed, 8 Dec 2021 23:32:55 +0100 Subject: [PATCH 13/15] update tuto --- demo/quantization_end_to_end.ipynb | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/demo/quantization_end_to_end.ipynb b/demo/quantization_end_to_end.ipynb index 171a88fe..fe057863 100644 --- a/demo/quantization_end_to_end.ipynb +++ b/demo/quantization_end_to_end.ipynb @@ -33,10 +33,10 @@ "| TensorRT (transplantation) | **INT-8** | 15 | 84.8 % | **X 5.1** | GPU |\n", "| TensorRT (custom QDQ code) | **INT-8** | 15 | 85.6 % | **X 5.1** | GPU |\n", "\n", - "> measures done on a Nvidia RTX 3090 GPU + 12 cores i7 Intel CPU\n", - "> accuracy obtained after a single epoch, no LR search or any hyper parameter optimization\n", - "> CPU measures are unfair but still indicative of what kind of perf to expect from Pytorch+CPU deployment\n", - "> same kind of acceleration is observed on all seq len / batch sizes\n", + "> measures done on a Nvidia RTX 3090 GPU + 12 cores i7 Intel CPU \n", + "> accuracy obtained after a single epoch, no LR search or any hyper parameter optimization \n", + "> CPU measures are unfair but still indicative of what kind of perf to expect from Pytorch+CPU deployment \n", + "> same kind of acceleration is observed on all seq len / batch sizes \n", "\n", "\n", "## A (very) short intro to INT-8 quantization\n", @@ -61,7 +61,6 @@ "\n", "* in the inside, Pytorch will train with high precision float numbers,\n", "* on the outside, Pytorch will simulate that a quantization has already been applied and output results accordingly (for loss computation for instance)\n", - "* it will also refine the quantization mapping (scale, range, distribution center, etc.)\n", "\n", "The simulation process is done through the add of quantization / dequantization nodes, most often called `QDQ`, it's an abbreviation you will see often in quantization world.\n", "\n", From 1d12dcfc8aecea4756855d80755d43ca2e5bb492 Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Wed, 8 Dec 2021 23:36:02 +0100 Subject: [PATCH 14/15] update tuto --- demo/quantization_end_to_end.ipynb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/demo/quantization_end_to_end.ipynb b/demo/quantization_end_to_end.ipynb index fe057863..88b07cb0 100644 --- a/demo/quantization_end_to_end.ipynb +++ b/demo/quantization_end_to_end.ipynb @@ -33,10 +33,10 @@ "| TensorRT (transplantation) | **INT-8** | 15 | 84.8 % | **X 5.1** | GPU |\n", "| TensorRT (custom QDQ code) | **INT-8** | 15 | 85.6 % | **X 5.1** | GPU |\n", "\n", - "> measures done on a Nvidia RTX 3090 GPU + 12 cores i7 Intel CPU \n", - "> accuracy obtained after a single epoch, no LR search or any hyper parameter optimization \n", - "> CPU 
measures are unfair but still indicative of what kind of perf to expect from Pytorch+CPU deployment \n", - "> same kind of acceleration is observed on all seq len / batch sizes \n", + "> measures done on a Nvidia RTX 3090 GPU + 12 cores i7 Intel CPU\n", + "> accuracy obtained after a single epoch, no LR search or any hyper parameter optimization\n", + "> CPU measures are unfair (no try to optimize inference speed at all) but still indicative of what kind of perf to expect from Pytorch+CPU deployment\n", + "> same kind of acceleration is observed on all seq len / batch sizes\n", "\n", "\n", "## A (very) short intro to INT-8 quantization\n", From 524ff6ba68665224c25d9bdb8a411e64a144ed00 Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Wed, 8 Dec 2021 23:41:18 +0100 Subject: [PATCH 15/15] update tuto --- demo/quantization_end_to_end.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/demo/quantization_end_to_end.ipynb b/demo/quantization_end_to_end.ipynb index 88b07cb0..1d33edce 100644 --- a/demo/quantization_end_to_end.ipynb +++ b/demo/quantization_end_to_end.ipynb @@ -33,9 +33,9 @@ "| TensorRT (transplantation) | **INT-8** | 15 | 84.8 % | **X 5.1** | GPU |\n", "| TensorRT (custom QDQ code) | **INT-8** | 15 | 85.6 % | **X 5.1** | GPU |\n", "\n", - "> measures done on a Nvidia RTX 3090 GPU + 12 cores i7 Intel CPU\n", - "> accuracy obtained after a single epoch, no LR search or any hyper parameter optimization\n", - "> CPU measures are unfair (no try to optimize inference speed at all) but still indicative of what kind of perf to expect from Pytorch+CPU deployment\n", + "> measures done on a Nvidia RTX 3090 GPU + 12 cores i7 Intel CPU \n", + "> accuracy obtained after a single epoch, no LR search or any hyper parameter optimization \n", + "> CPU measures are unfair (no try to optimize inference speed at all) but still indicative of what kind of perf to expect from Pytorch+CPU deployment \n", "> same kind of acceleration is observed on all seq len / batch sizes\n", "\n", "\n",
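To close the loop on the INT-8 rows of the table above, the corresponding TensorRT engine can be built with the same `build_engine` helper used for the FP16 baseline earlier in this section, with the INT-8 flag enabled. This is a sketch rather than the notebook's exact cell: the logger and runtime setup, and the choice to keep FP16 enabled alongside INT-8, are assumptions on my part.

```python
import tensorrt as trt

from transformer_deploy.backends.trt_utils import build_engine

batch_size, max_seq_len = 32, 256  # shapes used throughout the tutorial
trt_logger = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(trt_logger)

engine = build_engine(
    runtime=runtime,
    onnx_file_path="roberta_q.onnx",  # QDQ model exported with opset 13
    logger=trt_logger,
    min_shape=(batch_size, max_seq_len),
    optimal_shape=(batch_size, max_seq_len),
    max_shape=(batch_size, max_seq_len),
    workspace_size=10000 * 1024 * 1024,
    fp16=True,  # keep FP16 kernels available for layers left in float
    int8=True,  # enable INT-8 kernels for the quantized (QDQ) layers
)
```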