From f217f55ae3bfa52592e7a24b338a1d55e6dc4771 Mon Sep 17 00:00:00 2001
From: Ziqing Yang
Date: Thu, 15 Jun 2023 17:13:17 +0800
Subject: [PATCH 1/9] add low-mem-merge script

---
 .../merge_llama_with_chinese_lora_low_mem.py | 334 ++++++++++++++++++
 1 file changed, 334 insertions(+)
 create mode 100644 scripts/merge_llama_with_chinese_lora_low_mem.py

diff --git a/scripts/merge_llama_with_chinese_lora_low_mem.py b/scripts/merge_llama_with_chinese_lora_low_mem.py
new file mode 100644
index 0000000..a770a2c
--- /dev/null
+++ b/scripts/merge_llama_with_chinese_lora_low_mem.py
@@ -0,0 +1,334 @@
+"""
+Usage:
+python merge_llama_with_chinese_lora_low_mem.py \
+    --base_model path/to/llama/model \
+    --lora_model path/to/first/lora/model[,path/to/second/lora/model] \
+    --output_type [pth|huggingface] \
+    --output_dir path/to/output/dir
+
+Low-memory version: the base model is loaded and merged one checkpoint file at
+a time, so peak RAM is roughly bounded by a single checkpoint file plus the
+LoRA weights.
+"""
+import argparse
+import json
+import os
+import gc
+import re
+
+import torch
+import peft
+from transformers import LlamaConfig, LlamaTokenizer
+from transformers.modeling_utils import dtype_byte_size
+from huggingface_hub import snapshot_download
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--base_model', default=None, required=True, type=str,
+                    help="Path (or Hugging Face hub ID) of the base LLaMA model in HF format")
+parser.add_argument('--lora_model', default=None, required=True, type=str,
+                    help="LoRA models to be merged, in order; separate multiple LoRA models with commas")
+parser.add_argument('--output_type', default='pth', choices=['pth', 'huggingface'], type=str,
+                    help="Save the merged model in pth or huggingface format")
+parser.add_argument('--output_dir', default='./', type=str)
+
+# Map the embedding width to the model size, and the model size to the shard
+# count and params.json entries used by the original LLaMA release.
+emb_to_model_size = {
+    4096: '7B',
+    5120: '13B',
+    6656: '33B',
+    8192: '65B',
+}
+num_shards_of_models = {'7B': 1, '13B': 2, '33B': 4, '65B': 8}
+params_of_models = {
+    '7B': {
+        "dim": 4096,
+        "multiple_of": 256,
+        "n_heads": 32,
+        "n_layers": 32,
+        "norm_eps": 1e-06,
+        "vocab_size": -1,
+    },
+    '13B': {
+        "dim": 5120,
+        "multiple_of": 256,
+        "n_heads": 40,
+        "n_layers": 40,
+        "norm_eps": 1e-06,
+        "vocab_size": -1,
+    },
+    '33B': {
+        "dim": 6656,
+        "multiple_of": 256,
+        "n_heads": 52,
+        "n_layers": 60,
+        "norm_eps": 1e-06,
+        "vocab_size": -1,
+    },
+    '65B': {
+        "dim": 8192,
+        "multiple_of": 256,
+        "n_heads": 64,
+        "n_layers": 80,
+        "norm_eps": 1e-05,
+        "vocab_size": -1,
+    },
+}
+
+
+def transpose(weight, fan_in_fan_out):
+    return weight.T if fan_in_fan_out else weight
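+
+# For each LoRA-adapted weight, merging computes W' = W + scaling * (B @ A),
+# where scaling = lora_alpha / r. Layers flagged as fan_in_fan_out (e.g.
+# GPT-2-style Conv1D projections) store W transposed; the transpose() helper
+# above accounts for that.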
f"layers.{layer}.attention_norm.weight" + elif k.endswith(".post_attention_layernorm.weight"): + return f"layers.{layer}.ffn_norm.weight" + elif k.endswith("rotary_emb.inv_freq") or "lora" in k: + return None + else: + print(layer, k) + raise NotImplementedError + else: + print(k) + raise NotImplementedError + + +def unpermute(w): + return ( + w.view(n_heads, 2, dim // n_heads // 2, dim).transpose(1, 2).reshape(dim, dim) + ) + + +def save_shards(model_sd, num_shards: int, prefix=""): + # Add the no_grad context manager + with torch.no_grad(): + if num_shards == 1: + new_state_dict = {} + for k, v in model_sd.items(): + new_k = translate_state_dict_key(k) + if new_k is not None: + if "wq" in new_k or "wk" in new_k: + new_state_dict[new_k] = unpermute(v) + else: + new_state_dict[new_k] = v + + os.makedirs(output_dir, exist_ok=True) + print(f"Saving shard 1 of {num_shards} into {output_dir}/{prefix}consolidated.00.pth") + torch.save(new_state_dict, output_dir + f"/{prefix}consolidated.00.pth") + else: + new_state_dicts = [dict() for _ in range(num_shards)] + for k in list(model_sd.keys()): + v = model_sd[k] + new_k = translate_state_dict_key(k) + if new_k is not None: + if new_k=='tok_embeddings.weight': + print(f"Processing {new_k}") + assert v.size(1)%num_shards==0 + splits = v.split(v.size(1)//num_shards,dim=1) + elif new_k=='output.weight': + print(f"Processing {new_k}") + if v.size(0)%num_shards==0: + splits = v.split(v.size(0)//num_shards,dim=0) + else: + size_list = [v.size(0)//num_shards] * num_shards + size_list[-1] += v.size(0)%num_shards + splits = v.split(size_list, dim=0) # 13B: size_list == [24976,24977] + elif new_k=='norm.weight': + print(f"Processing {new_k}") + splits = [v] * num_shards + elif 'ffn_norm.weight' in new_k: + print(f"Processing {new_k}") + splits = [v] * num_shards + elif 'attention_norm.weight' in new_k: + print(f"Processing {new_k}") + splits = [v] * num_shards + + + elif 'w1.weight' in new_k: + print(f"Processing {new_k}") + splits = v.split(v.size(0)//num_shards,dim=0) + elif 'w2.weight' in new_k: + print(f"Processing {new_k}") + splits = v.split(v.size(1)//num_shards,dim=1) + elif 'w3.weight' in new_k: + print(f"Processing {new_k}") + splits = v.split(v.size(0)//num_shards,dim=0) + + + elif 'wo.weight' in new_k: + print(f"Processing {new_k}") + splits = v.split(v.size(1)//num_shards,dim=1) + + elif 'wv.weight' in new_k: + print(f"Processing {new_k}") + splits = v.split(v.size(0)//num_shards,dim=0) + + elif "wq.weight" in new_k or "wk.weight" in new_k: + print(f"Processing {new_k}") + v = unpermute(v) + splits = v.split(v.size(0)//num_shards,dim=0) + else: + print(f"Unexpected key {new_k}") + raise ValueError + for sd,split in zip(new_state_dicts,splits): + sd[new_k] = split.clone() + del split + del splits + del model_sd[k],v + gc.collect() # Effectively enforce garbage collection + + os.makedirs(output_dir, exist_ok=True) + for i,new_state_dict in enumerate(new_state_dicts): + print(f"Saving shard {i+1} of {num_shards} into {output_dir}/{prefix}consolidated.0{i}.pth") + torch.save(new_state_dict, output_dir + f"/{prefix}consolidated.0{i}.pth") + +def merge_shards(output_dir, num_shards: int): + ckpt_filenames = sorted([f for f in os.listdir(output_dir) if re.match('L(\d+)-consolidated.(\d+).pth',f)]) + + for i in range(num_shards): + shards_filenames = sorted([f for f in ckpt_filenames if re.match(f'L(\d+)-consolidated.0{i}.pth',f)]) + print(f"Loading {shards_filenames} ...") + shards_dicts = [torch.load(os.path.join(output_dir,fn)) for fn in 
+
+def merge_shards(output_dir, num_shards: int):
+    ckpt_filenames = sorted([f for f in os.listdir(output_dir) if re.match(r'L(\d+)-consolidated\.(\d+)\.pth', f)])
+
+    for i in range(num_shards):
+        shards_filenames = sorted([f for f in ckpt_filenames if re.match(rf'L(\d+)-consolidated\.0{i}\.pth', f)])
+        print(f"Loading {shards_filenames} ...")
+        shards_dicts = [torch.load(os.path.join(output_dir, fn)) for fn in shards_filenames]
+        shards_merged = {}
+        for d in shards_dicts:
+            shards_merged |= d  # dict union (requires Python 3.9+)
+
+        print("Saving the merged shard to " + os.path.join(output_dir, f"consolidated.0{i}.pth"))
+        torch.save(shards_merged, os.path.join(output_dir, f"consolidated.0{i}.pth"))
+
+        print("Cleaning up...")
+        del shards_merged
+        for d in shards_dicts:
+            del d
+        del shards_dicts
+        gc.collect()  # free the loaded shards before the next iteration
+        for fn in shards_filenames:
+            os.remove(os.path.join(output_dir, fn))
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    base_model_path = args.base_model
+    lora_model_paths = [s.strip() for s in args.lora_model.split(',') if len(s.strip()) != 0]
+    output_dir = args.output_dir
+    output_type = args.output_type
+    os.makedirs(output_dir, exist_ok=True)
+
+    print(f"Base model: {base_model_path}")
+    print(f"LoRA model(s): {lora_model_paths}")
+
+    tokenizers_and_loras = []
+    for lora_model_path in lora_model_paths:
+        print(f"Loading {lora_model_path}")
+        if not os.path.exists(lora_model_path):
+            print("Cannot find LoRA model on disk. Downloading LoRA model from hub...")
+            lora_model_path = snapshot_download(repo_id=lora_model_path)
+        tokenizer = LlamaTokenizer.from_pretrained(lora_model_path)
+        lora_config = peft.LoraConfig.from_pretrained(lora_model_path)
+        lora_state_dict = torch.load(os.path.join(lora_model_path, 'adapter_model.bin'), map_location='cpu')
+        tokenizers_and_loras.append(
+            {
+                "tokenizer": tokenizer,
+                "state_dict": lora_state_dict,
+                "config": lora_config,
+                "scaling": lora_config.lora_alpha / lora_config.r,
+                "fan_in_fan_out": lora_config.fan_in_fan_out,
+            })
+
+    if not os.path.exists(base_model_path):
+        print("Cannot find base model on disk. Downloading base model from hub...")
+        base_model_path = snapshot_download(repo_id=base_model_path)
+    ckpt_filenames = sorted([f for f in os.listdir(base_model_path) if re.match(r'pytorch_model-(\d+)-of-(\d+)\.bin', f)])
+
+    embedding_size = None
+    model_size = None
+
+    total_size = 0
+    for index, filename in enumerate(ckpt_filenames):
+        print(f"Loading ckpt {filename}")
+        state_dict = torch.load(os.path.join(base_model_path, filename), map_location='cpu')
+        if index == 0:
+            # Infer the model size from the embedding width of the first checkpoint.
+            embedding_size = state_dict['model.embed_tokens.weight'].shape[1]
+            model_size = emb_to_model_size[embedding_size]
+            if output_type == 'pth':
+                params = params_of_models[model_size]
+                num_shards = num_shards_of_models[model_size]
+                n_layers = params["n_layers"]
+                n_heads = params["n_heads"]
+                dim = params["dim"]
+                dims_per_head = dim // n_heads
+                base = 10000.0
+                inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
+        for k in state_dict:
+            for ti, tandl in enumerate(tokenizers_and_loras):
+                saved_key = 'base_model.model.' + k
+                lora_key_A = saved_key.replace('.weight', '.lora_A.weight')
+                if saved_key in tandl['state_dict']:
+                    # The LoRA checkpoint stores a full replacement for this tensor
+                    # (e.g. the resized embedding / lm_head); copy it over.
+                    print(f"copying {saved_key} from {ti}-th LoRA weight to {k}")
+                    state_dict[k] = tandl['state_dict'][saved_key].half().clone()  # cast to fp16 to match the base weights
+                if lora_key_A in tandl['state_dict']:
+                    lora_key_B = lora_key_A.replace('lora_A.weight', 'lora_B.weight')
+                    print(f"merging {lora_key_A} and lora_B.weight from {ti}-th LoRA weight to {k}")
+                    state_dict[k] += (
+                        transpose(
+                            tandl['state_dict'][lora_key_B].float()
+                            @ tandl['state_dict'][lora_key_A].float(), tandl['fan_in_fan_out']) * tandl['scaling']
+                    )
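+                    # For example, with r=8 and lora_alpha=32 the scaling is
+                    # 32/8 = 4.0, so W <- W + 4.0 * (B @ A); lora_B.weight is
+                    # (out_features, r) and lora_A.weight is (r, in_features),
+                    # so their product matches W's shape.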
+            weight_size = state_dict[k].numel() * dtype_byte_size(state_dict[k].dtype)
+            total_size += weight_size
+
+        if output_type == 'huggingface':
+            print(f"Saving ckpt {filename} to {output_dir} in HF format...")
+            torch.save(state_dict, os.path.join(output_dir, filename))
+        elif output_type == 'pth':
+            print(f"Saving ckpt {filename} to {output_dir} in pth format...")
+            save_shards(model_sd=state_dict, num_shards=num_shards, prefix=f"L{index+1}-")
+        del state_dict
+        gc.collect()  # release the merged checkpoint before loading the next one
+
+    print("Saving tokenizer")
+    # The last LoRA's tokenizer carries the final (largest) vocabulary.
+    tokenizers_and_loras[-1]['tokenizer'].save_pretrained(output_dir)
+    if output_type == 'pth':
+        with open(output_dir + "/params.json", "w") as f:
+            print(f"Saving params.json into {output_dir}/params.json")
+            json.dump(params, f)
+        merge_shards(output_dir, num_shards=num_shards)
+
+    if output_type == 'huggingface':
+        configs = ('config.json', 'generation_config.json', 'pytorch_model.bin.index.json')
+        for config in configs:
+            if os.path.exists(os.path.join(base_model_path, config)):
+                print(f"Saving {config}")
+                obj = json.load(open(os.path.join(base_model_path, config)))
+                if config == 'pytorch_model.bin.index.json':
+                    obj['metadata']['total_size'] = total_size
+                json.dump(obj, open(os.path.join(output_dir, config), 'w'), indent=2)
+    print("Done.")
\ No newline at end of file

From ddfe5b80323c3740548039ab2475efcc669b2c63 Mon Sep 17 00:00:00 2001
From: ymcui
Date: Thu, 15 Jun 2023 17:17:44 +0800
Subject: [PATCH 2/9] update new conversion notebook (low-mem)

---
 notebooks/README.md                           |   22 +-
 ...ert_and_quantize_chinese_alpaca_plus.ipynb | 1171 ----------
 .../convert_and_quantize_chinese_llama.ipynb  | 1874 -----------------
 3 files changed, 10 insertions(+), 3057 deletions(-)
 delete mode 100644 notebooks/convert_and_quantize_chinese_alpaca_plus.ipynb
 delete mode 100644 notebooks/convert_and_quantize_chinese_llama.ipynb

diff --git a/notebooks/README.md b/notebooks/README.md
index 149f3a6..27b4486 100644
--- a/notebooks/README.md
+++ b/notebooks/README.md
@@ -1,20 +1,12 @@
 # 笔记本示例 Notebooks
 
-### convert_and_quantize_chinese_llama.ipynb
+### convert_and_quantize_chinese_llama_and_alpaca.ipynb
 
-Colab上的转换和量化中文LLaMA/Alpaca的运行示例(仅供流程参考)。
+Colab上的转换和量化中文LLaMA/Alpaca(含Plus版本)的运行示例(仅供流程参考)。
 
 Example of conversion and quantization for Chinese LLaMA/Alpaca.
 
-建议查看Colab上的最新版 / Check latest notebook:Open In Colab
-
-### convert_and_quantize_chinese_alpaca_plus.ipynb
-
-Colab上的转换和量化中文Alpaca-Plus的运行示例(仅供流程参考)。
-
-Example of conversion and quantization for Chinese Alpaca-Plus.
-
-建议查看Colab上的最新版 / Check latest notebook:Open In Colab
+建议查看Colab上的最新版 / Check latest notebook:Open In Colab
 
 ### pretrain_chinese_llama_lora.ipynb
 
@@ -38,4 +30,10 @@ Colab上的Gradio演示示例。
 
 Example of running the Gradio demo on Colab.
 
-在Colab中打开 / Open the notebook in Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ymcui/Chinese-LLaMA-Alpaca/blob/main/notebooks/gradio_web_demo.ipynb)
\ No newline at end of file
+在Colab中打开 / Open the notebook in Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ymcui/Chinese-LLaMA-Alpaca/blob/main/notebooks/gradio_web_demo.ipynb)
+
+### legacy/
+
+旧版notebook,供参考,但不会再更新。
+
+Old notebooks, kept for reference only; they will no longer be updated.
\ No newline at end of file diff --git a/notebooks/convert_and_quantize_chinese_alpaca_plus.ipynb b/notebooks/convert_and_quantize_chinese_alpaca_plus.ipynb deleted file mode 100644 index b3bf1e3..0000000 --- a/notebooks/convert_and_quantize_chinese_alpaca_plus.ipynb +++ /dev/null @@ -1,1171 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "B1c96_k3MahN" - }, - "source": [ - "# 转换并量化中文Alpaca Plus模型\n", - "\n", - "关于其他模型请参考另一个notebook:https://colab.research.google.com/drive/1Eak6azD3MLeb-YsfbP8UZC8wrL1ddIMI?usp=sharing\n", - "\n", - "\n", - "🎉🎉🎉 **新:现在免费用户也有机会能够转换7B和13B模型了!**\n", - "\n", - "💡 提示和小窍门:\n", - "- 免费用户默认的内存只有12G左右,**笔者用免费账号实测选择TPU的话有机会随机出35G内存**,建议多试几次。如果能随机出25G内存以上的机器就可以了转换7B模型了,35G内存以上机器就能转换13B模型了\n", - "- Pro(+)用户请选择 “代码执行程序” -> “更改运行时类型” -> “高RAM”\n", - "- 实测:转换7B级别模型,25G内存的机器就够了;转换13B级别模型需要30G以上的内存(程序莫名崩掉或断开连接就说明内存爆了)\n", - "- 如果选了“高RAM”之后内存还是不够大的话,选择以下操作,有的时候会分配出很高内存的机器,祝你好运😄!\n", - " - 可以把GPU或者TPU也选上(虽然不会用到)\n", - " - 选GPU时,Pro用户可选“高级”类型GPU\n", - "\n", - "以下信息配置信息供参考(Pro订阅下测试),运行时规格设置为“高RAM”时的设备配置如下(有随机性):\n", - "\n", - "| 硬件加速器 | RAM | 硬盘 |\n", - "| :-- | :--: | :--: |\n", - "| None | 25GB | 225GB |\n", - "| TPU | 35GB | 225GB |\n", - "| GPU(标准,T4)| 25GB | 166GB |\n", - "| GPU(高性能,V100)| 25GB | 166GB |\n", - "| GPU(高性能,A100)| **80GB** | 166GB |\n", - "\n", - "*温馨提示:用完之后注意断开运行时,选择满足要求的最低配置即可,避免不必要的计算单元消耗(Pro只给100个计算单元)。*" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vScqHD_jMFOV" - }, - "source": [ - "## 安装相关依赖" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "E5WKFJXIL6ZU", - "outputId": "87a89bed-053e-4e61-e2f8-1dfcbdf87fbf" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Collecting torch==1.12.0\n", - " Downloading torch-1.12.0-cp310-cp310-manylinux1_x86_64.whl (776.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m776.3/776.3 MB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch==1.12.0) (4.5.0)\n", - "Installing collected packages: torch\n", - " Attempting uninstall: torch\n", - " Found existing installation: torch 2.0.0+cu118\n", - " Uninstalling torch-2.0.0+cu118:\n", - " Successfully uninstalled torch-2.0.0+cu118\n", - "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", - "torchvision 0.15.1+cu118 requires torch==2.0.0, but you have torch 1.12.0 which is incompatible.\n", - "torchtext 0.15.1 requires torch==2.0.0, but you have torch 1.12.0 which is incompatible.\n", - "torchdata 0.6.0 requires torch==2.0.0, but you have torch 1.12.0 which is incompatible.\n", - "torchaudio 2.0.1+cu118 requires torch==2.0.0, but you have torch 1.12.0 which is incompatible.\n", - "peft 0.2.0 requires torch>=1.13.0, but you have torch 1.12.0 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0mSuccessfully installed torch-1.12.0\n", - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.28.1)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n", - "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)\n", - "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n", - "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n", - "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.13.3)\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4)\n", - "Requirement already satisfied: huggingface-hub<1.0,>=0.11.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.14.1)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.0)\n", - "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0)\n", - "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (2023.4.0)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (4.5.0)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", - "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.12)\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.15)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2022.12.7)\n", - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Collecting git+https://github.com/huggingface/peft\n", - " Cloning https://github.com/huggingface/peft to /tmp/pip-req-build-tnxzt7q0\n", - " Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft /tmp/pip-req-build-tnxzt7q0\n", - " Resolved https://github.com/huggingface/peft to commit 632997d1fb776c3cf05d8c2537ac9a98a7ce9435\n", - " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", - " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", - " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", - "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (23.1)\n", - "Requirement already satisfied: accelerate in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (0.18.0)\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (1.22.4)\n", - "Collecting torch>=1.13.0\n", - " Downloading torch-2.0.0-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m619.9/619.9 MB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (6.0)\n", - "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (5.9.5)\n", - "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (4.28.1)\n", - "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (3.1)\n", - "Collecting nvidia-cufft-cu11==10.9.0.58\n", - " Downloading nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux1_x86_64.whl (168.4 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m168.4/168.4 MB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cudnn-cu11==8.5.0.96\n", - " Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl (557.1 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m557.1/557.1 MB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (2.0.0)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (3.12.0)\n", - "Collecting nvidia-cuda-runtime-cu11==11.7.99\n", - " Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m849.3/849.3 kB\u001b[0m \u001b[31m48.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (3.1.2)\n", - "Collecting nvidia-nccl-cu11==2.14.3\n", - " Downloading nvidia_nccl_cu11-2.14.3-py3-none-manylinux1_x86_64.whl (177.1 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m177.1/177.1 MB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (1.11.1)\n", - "Collecting nvidia-cusparse-cu11==11.7.4.91\n", - " Downloading nvidia_cusparse_cu11-11.7.4.91-py3-none-manylinux1_x86_64.whl (173.2 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m173.2/173.2 MB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cublas-cu11==11.10.3.66\n", - " Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl (317.1 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m 
\u001b[32m317.1/317.1 MB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting nvidia-nvtx-cu11==11.7.91\n", - " Downloading nvidia_nvtx_cu11-11.7.91-py3-none-manylinux1_x86_64.whl (98 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.6/98.6 kB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (4.5.0)\n", - "Collecting nvidia-curand-cu11==10.2.10.91\n", - " Downloading nvidia_curand_cu11-10.2.10.91-py3-none-manylinux1_x86_64.whl (54.6 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.6/54.6 MB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cusolver-cu11==11.4.0.1\n", - " Downloading nvidia_cusolver_cu11-11.4.0.1-2-py3-none-manylinux1_x86_64.whl (102.6 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m102.6/102.6 MB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cuda-nvrtc-cu11==11.7.99\n", - " Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.0/21.0 MB\u001b[0m \u001b[31m63.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cuda-cupti-cu11==11.7.101\n", - " Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.8/11.8 MB\u001b[0m \u001b[31m75.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: wheel in /usr/local/lib/python3.10/dist-packages (from nvidia-cublas-cu11==11.10.3.66->torch>=1.13.0->peft==0.3.0.dev0) (0.40.0)\n", - "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from nvidia-cublas-cu11==11.10.3.66->torch>=1.13.0->peft==0.3.0.dev0) (67.7.2)\n", - "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.13.0->peft==0.3.0.dev0) (3.25.2)\n", - "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.13.0->peft==0.3.0.dev0) (16.0.2)\n", - "Requirement already satisfied: huggingface-hub<1.0,>=0.11.0 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (0.14.1)\n", - "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (2022.10.31)\n", - "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (0.13.3)\n", - "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (4.65.0)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (2.27.1)\n", - "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers->peft==0.3.0.dev0) (2023.4.0)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.13.0->peft==0.3.0.dev0) (2.1.2)\n", 
- "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.3.0.dev0) (2.0.12)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.3.0.dev0) (2022.12.7)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.3.0.dev0) (3.4)\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.3.0.dev0) (1.26.15)\n", - "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.13.0->peft==0.3.0.dev0) (1.3.0)\n", - "Building wheels for collected packages: peft\n", - " Building wheel for peft (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for peft: filename=peft-0.3.0.dev0-py3-none-any.whl size=55537 sha256=3cc2a65c09926ac217ac671b7d9c1640eac9857f0aca55b78a9fcda484263073\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-1rjlvx70/wheels/4c/16/67/1002a2d4daa822eff130e6d85b90051b75d2ce0d26b9448e4a\n", - "Successfully built peft\n", - "Installing collected packages: nvidia-nvtx-cu11, nvidia-nccl-cu11, nvidia-cusparse-cu11, nvidia-curand-cu11, nvidia-cufft-cu11, nvidia-cuda-runtime-cu11, nvidia-cuda-nvrtc-cu11, nvidia-cuda-cupti-cu11, nvidia-cublas-cu11, nvidia-cusolver-cu11, nvidia-cudnn-cu11, torch, peft\n", - " Attempting uninstall: torch\n", - " Found existing installation: torch 1.12.0\n", - " Uninstalling torch-1.12.0:\n", - " Successfully uninstalled torch-1.12.0\n", - " Attempting uninstall: peft\n", - " Found existing installation: peft 0.2.0\n", - " Uninstalling peft-0.2.0:\n", - " Successfully uninstalled peft-0.2.0\n", - "Successfully installed nvidia-cublas-cu11-11.10.3.66 nvidia-cuda-cupti-cu11-11.7.101 nvidia-cuda-nvrtc-cu11-11.7.99 nvidia-cuda-runtime-cu11-11.7.99 nvidia-cudnn-cu11-8.5.0.96 nvidia-cufft-cu11-10.9.0.58 nvidia-curand-cu11-10.2.10.91 nvidia-cusolver-cu11-11.4.0.1 nvidia-cusparse-cu11-11.7.4.91 nvidia-nccl-cu11-2.14.3 nvidia-nvtx-cu11-11.7.91 peft-0.3.0.dev0 torch-2.0.0\n", - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (0.1.98)\n" - ] - } - ], - "source": [ - "!pip install torch==1.12.0\n", - "!pip install transformers\n", - "!pip install git+https://github.com/huggingface/peft\n", - "!pip install sentencepiece" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ygb1xFIMNQKw" - }, - "source": [ - "## 克隆目录和代码" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "yCEJh7NJNXz9", - "outputId": "ec16f31b-7af7-4eb8-82ce-5f9317bad941" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Cloning into 'Chinese-LLaMA-Alpaca'...\n", - "remote: Enumerating objects: 761, done.\u001b[K\n", - "remote: Counting objects: 100% (202/202), done.\u001b[K\n", - "remote: Compressing objects: 100% (172/172), done.\u001b[K\n", - "remote: Total 761 (delta 54), reused 69 (delta 29), pack-reused 559\u001b[K\n", - "Receiving objects: 100% (761/761), 11.16 MiB | 22.49 MiB/s, done.\n", - "Resolving deltas: 100% (444/444), done.\n", - "Cloning into 'llama.cpp'...\n", - "remote: Enumerating objects: 2086, done.\u001b[K\n", - 
"remote: Counting objects: 100% (842/842), done.\u001b[K\n", - "remote: Compressing objects: 100% (99/99), done.\u001b[K\n", - "remote: Total 2086 (delta 778), reused 756 (delta 743), pack-reused 1244\u001b[K\n", - "Receiving objects: 100% (2086/2086), 2.12 MiB | 16.33 MiB/s, done.\n", - "Resolving deltas: 100% (1345/1345), done.\n" - ] - } - ], - "source": [ - "!git clone https://github.com/ymcui/Chinese-LLaMA-Alpaca\n", - "!git clone https://github.com/ggerganov/llama.cpp" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nIyxX0DSNsgQ" - }, - "source": [ - "## 合并模型(Alpaca-Plus-7B)\n", - "\n", - "**⚠️ 再次提醒:7B模型需要25G内存,13B模型需要35G+内存。**\n", - "\n", - "此处使用的是🤗模型库中提供的基模型(已是HF格式),而不是Facebook官方的LLaMA模型,因此略去将原版LLaMA转换为HF格式的步骤。\n", - "\n", - "**这里直接运行第二步:合并LoRA权重**,生成全量模型权重。可以直接指定🤗模型库的地址,也可以是本地存放地址。\n", - "- 基模型:`decapoda-research/llama-7b-hf` *(use at your own risk)*\n", - "- LoRA模型:先写`ziqingyang/chinese-llama-plus-lora-7b`然后再写`ziqingyang/chinese-alpaca-plus-lora-7b`\n", - "- 输出类型:因为后续要量化,这里将`output_type`设置为`pth`\n", - "\n", - "💡 转换13B模型提示:\n", - "- 请将参数`--base_model`和`--lora_model`中的的`7b`改为`13b`即可\n", - "- **免费用户必须增加一个参数`--offload_dir`以缓解内存压力**,例如`--offload_dir ./offload_temp`\n", - "\n", - "该过程比较耗时(下载+转换),需要几分钟到十几分钟不等,请耐心等待。\n", - "转换好的模型存放在`alpaca-combined`目录。\n", - "如果你不需要量化模型,那么到这一步就结束了。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5AV4EW5hNhVV", - "outputId": "91901b82-88c4-405d-cf86-32f1a3a60467" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "2023-04-28 08:07:00.276520: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "Base model: decapoda-research/llama-7b-hf\n", - "LoRA model(s) ['ziqingyang/chinese-llama-plus-lora-7b', 'ziqingyang/chinese-alpaca-plus-lora-7b']:\n", - "Loading checkpoint shards: 100% 33/33 [01:18<00:00, 2.39s/it]\n", - "Peft version: 0.3.0.dev0\n", - "Loading LoRA for 7B model\n", - "Loading LoRA ziqingyang/chinese-llama-plus-lora-7b\n", - "Extended vocabulary size to 49953\n", - "Downloading (…)/adapter_config.json: 100% 420/420 [00:00<00:00, 1.61MB/s]\n", - "Downloading adapter_model.bin: 100% 858M/858M [00:04<00:00, 185MB/s]\n", - "Merging with merge_and_unload...\n", - "Loading LoRA ziqingyang/chinese-alpaca-plus-lora-7b\n", - "Downloading tokenizer.model: 100% 758k/758k [00:00<00:00, 13.4MB/s]\n", - "Downloading (…)cial_tokens_map.json: 100% 96.0/96.0 [00:00<00:00, 535kB/s]\n", - "Downloading (…)okenizer_config.json: 100% 166/166 [00:00<00:00, 854kB/s]\n", - "Extended vocabulary size to 49954\n", - "Downloading (…)/adapter_config.json: 100% 423/423 [00:00<00:00, 2.31MB/s]\n", - "Downloading adapter_model.bin: 100% 1.14G/1.14G [00:16<00:00, 70.6MB/s]\n", - "Merging with merge_and_unload...\n", - "Saving to pth format...\n", - "Saving shard 1 of 1 into alpaca-combined/consolidated.00.pth\n" - ] - } - ], - "source": [ - "!python ./Chinese-LLaMA-Alpaca/scripts/merge_llama_with_chinese_lora.py \\\n", - " --base_model decapoda-research/llama-7b-hf \\\n", - " --lora_model ziqingyang/chinese-llama-plus-lora-7b,ziqingyang/chinese-alpaca-plus-lora-7b \\\n", - " --output_type pth \\\n", - " --output_dir alpaca-combined" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ueexcKo-Q_EW" - }, - "source": [ - "## 量化模型\n", - "接下来我们使用[llama.cpp](https://github.com/ggerganov/llama.cpp)工具对上一步生成的全量版本权重进行转换,生成4-bit量化模型。\n", - "\n", - "### 编译工具\n", - "\n", - 
"首先对llama.cpp工具进行编译。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "_GbjsT2wRRCR", - "outputId": "2b4f2a38-d22d-4764-9a81-bad8bd72b7fe" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "I llama.cpp build info: \n", - "I UNAME_S: Linux\n", - "I UNAME_P: x86_64\n", - "I UNAME_M: x86_64\n", - "I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -pthread -march=native -mtune=native\n", - "I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native\n", - "I LDFLAGS: \n", - "I CC: cc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", - "I CXX: g++ (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", - "\n", - "cc -I. -O3 -DNDEBUG -std=c11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -pthread -march=native -mtune=native -c ggml.c -o ggml.o\n", - "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native -c llama.cpp -o llama.o\n", - "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native -c examples/common.cpp -o common.o\n", - "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native examples/main/main.cpp ggml.o llama.o common.o -o main \n", - "\n", - "==== Run ./main -h for help. ====\n", - "\n", - "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native examples/quantize/quantize.cpp ggml.o llama.o -o quantize \n", - "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats \n", - "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity \n", - "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding \n", - "g++ -I. 
-I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native pocs/vdot/vdot.cpp ggml.o -o vdot \n" - ] - } - ], - "source": [ - "!cd llama.cpp && make" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gw2xpYC0RcQC" - }, - "source": [ - "### 模型转换为ggml格式(FP16)\n", - "\n", - "这一步,我们将模型转换为ggml格式(FP16)。\n", - "- 在这之前需要把`alpaca-combined`目录挪个位置,把模型文件放到`llama.cpp/zh-models/7B`下,把`tokenizer.model`放到`llama.cpp/zh-models`\n", - "- tokenizer在哪里?\n", - " - `alpaca-combined`目录下有\n", - " - 或者从以下网址下载:https://huggingface.co/ziqingyang/chinese-alpaca-lora-7b/resolve/main/tokenizer.model (注意,Alpaca和LLaMA的`tokenizer.model`不能混用!)\n", - "\n", - "💡 转换13B模型提示:\n", - "- tokenizer可以直接用7B的,13B和7B的相同\n", - "- Alpaca和LLaMA的`tokenizer.model`不能混用!\n", - "- 以下看到7B字样的都是文件夹名,与转换过程没有关系了,改不改都行" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5KgnFVStRjio", - "outputId": "19293a4a-a400-4cd3-c98b-80022dcd1f35" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "7B tokenizer.model\n" - ] - } - ], - "source": [ - "!cd llama.cpp && mkdir zh-models && mv ../alpaca-combined zh-models/7B\n", - "!mv llama.cpp/zh-models/7B/tokenizer.model llama.cpp/zh-models/\n", - "!ls llama.cpp/zh-models/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "NUHeoTMQS1AQ", - "outputId": "378b70db-d13b-4aa9-8bb0-a1fc1cd4b13f" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Loading model file zh-models/7B/consolidated.00.pth\n", - "Loading vocab file zh-models/tokenizer.model\n", - "Writing vocab...\n", - "[ 1/291] Writing tensor tok_embeddings.weight | size 49954 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 2/291] Writing tensor norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 3/291] Writing tensor output.weight | size 49954 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 4/291] Writing tensor layers.0.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 5/291] Writing tensor layers.0.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 6/291] Writing tensor layers.0.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 7/291] Writing tensor layers.0.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 8/291] Writing tensor layers.0.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 9/291] Writing tensor layers.0.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 10/291] Writing tensor layers.0.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[ 11/291] Writing tensor layers.0.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 12/291] Writing tensor layers.0.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 13/291] Writing tensor layers.1.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 14/291] Writing tensor layers.1.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 15/291] Writing tensor layers.1.attention.wv.weight | size 4096 x 4096 | type 
UnquantizedDataType(name='F16')\n", - "[ 16/291] Writing tensor layers.1.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 17/291] Writing tensor layers.1.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 18/291] Writing tensor layers.1.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 19/291] Writing tensor layers.1.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[ 20/291] Writing tensor layers.1.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 21/291] Writing tensor layers.1.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 22/291] Writing tensor layers.2.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 23/291] Writing tensor layers.2.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 24/291] Writing tensor layers.2.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 25/291] Writing tensor layers.2.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 26/291] Writing tensor layers.2.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 27/291] Writing tensor layers.2.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 28/291] Writing tensor layers.2.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[ 29/291] Writing tensor layers.2.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 30/291] Writing tensor layers.2.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 31/291] Writing tensor layers.3.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 32/291] Writing tensor layers.3.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 33/291] Writing tensor layers.3.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 34/291] Writing tensor layers.3.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 35/291] Writing tensor layers.3.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 36/291] Writing tensor layers.3.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 37/291] Writing tensor layers.3.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[ 38/291] Writing tensor layers.3.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 39/291] Writing tensor layers.3.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 40/291] Writing tensor layers.4.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 41/291] Writing tensor layers.4.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 42/291] Writing tensor layers.4.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 43/291] Writing tensor layers.4.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 44/291] Writing tensor layers.4.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 45/291] Writing tensor layers.4.feed_forward.w1.weight | size 11008 x 4096 | 
type UnquantizedDataType(name='F16')\n", - "[ 46/291] Writing tensor layers.4.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[ 47/291] Writing tensor layers.4.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 48/291] Writing tensor layers.4.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 49/291] Writing tensor layers.5.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 50/291] Writing tensor layers.5.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 51/291] Writing tensor layers.5.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 52/291] Writing tensor layers.5.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 53/291] Writing tensor layers.5.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 54/291] Writing tensor layers.5.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 55/291] Writing tensor layers.5.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[ 56/291] Writing tensor layers.5.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 57/291] Writing tensor layers.5.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 58/291] Writing tensor layers.6.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 59/291] Writing tensor layers.6.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 60/291] Writing tensor layers.6.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 61/291] Writing tensor layers.6.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 62/291] Writing tensor layers.6.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 63/291] Writing tensor layers.6.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 64/291] Writing tensor layers.6.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[ 65/291] Writing tensor layers.6.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 66/291] Writing tensor layers.6.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 67/291] Writing tensor layers.7.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 68/291] Writing tensor layers.7.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 69/291] Writing tensor layers.7.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 70/291] Writing tensor layers.7.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 71/291] Writing tensor layers.7.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 72/291] Writing tensor layers.7.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 73/291] Writing tensor layers.7.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[ 74/291] Writing tensor layers.7.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 75/291] Writing tensor layers.7.ffn_norm.weight | size 4096 | 
type UnquantizedDataType(name='F32')\n", - "[ 76/291] Writing tensor layers.8.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 77/291] Writing tensor layers.8.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 78/291] Writing tensor layers.8.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 79/291] Writing tensor layers.8.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 80/291] Writing tensor layers.8.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 81/291] Writing tensor layers.8.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 82/291] Writing tensor layers.8.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[ 83/291] Writing tensor layers.8.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 84/291] Writing tensor layers.8.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 85/291] Writing tensor layers.9.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 86/291] Writing tensor layers.9.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 87/291] Writing tensor layers.9.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 88/291] Writing tensor layers.9.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 89/291] Writing tensor layers.9.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 90/291] Writing tensor layers.9.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 91/291] Writing tensor layers.9.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[ 92/291] Writing tensor layers.9.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 93/291] Writing tensor layers.9.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 94/291] Writing tensor layers.10.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 95/291] Writing tensor layers.10.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 96/291] Writing tensor layers.10.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 97/291] Writing tensor layers.10.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 98/291] Writing tensor layers.10.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 99/291] Writing tensor layers.10.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[100/291] Writing tensor layers.10.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[101/291] Writing tensor layers.10.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[102/291] Writing tensor layers.10.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[103/291] Writing tensor layers.11.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[104/291] Writing tensor layers.11.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[105/291] Writing tensor layers.11.attention.wv.weight | 
size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[106/291] Writing tensor layers.11.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "... [output truncated: tensors 107-290 repeat the same per-layer pattern (attention wq/wk/wv/wo, attention_norm, feed_forward w1/w2/w3, ffn_norm) for layers 11-31] ...\n", - "[291/291] Writing tensor layers.31.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "Wrote zh-models/7B/ggml-model-f16.bin\n" - ] - } - ], - "source": [ - "!cd llama.cpp && python convert.py zh-models/7B/" - ] - },
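The cell above runs llama.cpp's `convert.py` to turn the merged checkpoint into a single `ggml-model-f16.bin`. For scripted pipelines, a small wrapper can drive this step and the quantization step that follows from Python; this is a hypothetical sketch, not part of this patch, and the `./quantize` argument shape (`<input> <output> <type>`) is an assumption that varies across llama.cpp revisions.

```python
# Hypothetical helper, not part of this PR: run the same llama.cpp steps
# from Python instead of notebook shell magics. The quantize CLI shape
# (<input> <output> <type>) is an assumption and may differ by revision.
import subprocess

def convert_and_quantize(model_dir: str = "zh-models/7B",
                         qtype: str = "q8_0") -> None:
    # Step 1: merged checkpoint -> ggml FP16 (writes ggml-model-f16.bin),
    # exactly what the notebook cell above does.
    subprocess.run(["python", "convert.py", model_dir],
                   cwd="llama.cpp", check=True)
    # Step 2: FP16 -> quantized ggml (the step shown next in the notebook,
    # whose recorded run saved to ggml-model-q4_0.bin).
    subprocess.run(["./quantize",
                    f"{model_dir}/ggml-model-f16.bin",
                    f"{model_dir}/ggml-model-{qtype}.bin",
                    qtype],
                   cwd="llama.cpp", check=True)
```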
- { - "cell_type": "markdown", - "metadata": { - "id": "hEZEJAVYCHkc" - }, - "source": [ - "### Quantize the FP16 model to 8-bit\n", - "\n", - "We further convert the FP16 model into an 8-bit quantized model." - ] - },
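The size ratio in the log below, e.g. 32.00 MB -> 18.00 MB for each 4096 x 4096 tensor, is exactly 9/16: block-wise 8-bit quantization stores 32 int8 values plus one f32 scale per block, i.e. 9 bits per weight instead of 16. A minimal NumPy sketch of such a scheme follows; it is an illustration, not llama.cpp's actual implementation, and the 32-element block with an f32 absmax scale is the assumed layout.

```python
# Sketch of block-wise absmax int8 quantization (assumed q8_0-style layout:
# 32 int8 values + one f32 scale per block = 9 bits/weight vs 16 for f16).
import numpy as np

BLOCK = 32  # assumed block size

def quantize_q8_0(w: np.ndarray):
    """Quantize a flat array to per-block int8 values plus f32 scales."""
    blocks = w.astype(np.float32).reshape(-1, BLOCK)
    scale = np.abs(blocks).max(axis=1, keepdims=True) / 127.0  # absmax per block
    q = np.round(blocks / np.where(scale == 0.0, 1.0, scale)).astype(np.int8)
    return q, scale.astype(np.float32)

w = np.random.randn(4096 * 4096).astype(np.float16)  # one attention matrix
q, s = quantize_q8_0(w)
bytes_f16 = w.size * 2              # 16 bits per weight
bytes_q8 = q.size + s.size * 4      # int8 payload + f32 scale per block
print(f"{bytes_f16 / 2**20:.2f} MB -> {bytes_q8 / 2**20:.2f} MB")
# -> 32.00 MB -> 18.00 MB, the 9/16 ratio seen in the log below
```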
- { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "2xyais7OUVDI", - "outputId": "b7fe3c62-489a-42e5-927a-8ab6088a3ecc" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "llama.cpp: loading model from ./zh-models/7B/ggml-model-f16.bin\n", - "llama.cpp: saving model to ./zh-models/7B/ggml-model-q4_0.bin\n", - "[ 1/ 291] tok_embeddings.weight - 4096 x 49954, type = f16, quantizing .. size = 390.27 MB -> 219.52 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 2/ 291] norm.weight - 4096, type = f32, size = 0.016 MB\n", - "[ 3/ 291] output.weight - 4096 x 49954, type = f16, quantizing .. size = 390.27 MB -> 219.52 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 4/ 291] layers.0.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.026 0.018 0.028 0.044 0.064 0.088 0.111 0.245 0.111 0.087 0.064 0.044 0.028 0.018 0.026 \n", - "[ 5/ 291] layers.0.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.026 0.017 0.028 0.043 0.063 0.087 0.111 0.250 0.112 0.087 0.063 0.043 0.028 0.017 0.026 \n", - "[ 6/ 291] layers.0.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.019 0.031 0.046 0.065 0.087 0.107 0.237 0.107 0.087 0.065 0.046 0.030 0.019 0.027 \n", - "[ 7/ 291] layers.0.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.026 0.017 0.027 0.042 0.062 0.087 0.113 0.253 0.113 0.087 0.062 0.042 0.027 0.017 0.026 \n", - "[ 8/ 291] layers.0.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", - "[ 9/ 291] layers.0.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 10/ 291] layers.0.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.047 0.031 0.020 0.027 \n", - "[ 11/ 291] layers.0.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.047 0.067 0.088 0.106 0.227 0.106 0.088 0.067 0.047 0.032 0.020 0.027 \n", - "[ 12/ 291] layers.0.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n",
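From the per-tensor numbers above one can estimate the final file sizes. A sanity-check sketch, using only the sizes printed in the log: it is approximate, ignores file headers and the tiny f32 norm tensors, and assumes layers 1-31 match layers.0, which the remaining (truncated) output below bears out.

```python
# Rough total-size estimate from the per-tensor log lines above (MB as printed).
# Ignores headers and the 0.016 MB f32 norm tensors; assumes layers 1-31
# repeat layers.0, as the truncated quantize output below shows.
emb_out_f16, emb_out_q8 = 390.27, 219.52   # tok_embeddings and output, each
attn_f16, attn_q8 = 32.00, 18.00           # wq/wk/wv/wo, each
ffn_f16, ffn_q8 = 86.00, 48.38             # w1/w2/w3, each
layers = 32
f16_total = 2 * emb_out_f16 + layers * (4 * attn_f16 + 3 * ffn_f16)
q8_total = 2 * emb_out_q8 + layers * (4 * attn_q8 + 3 * ffn_q8)
print(f"f16 ~ {f16_total / 1024:.1f} GB, 8-bit ~ {q8_total / 1024:.1f} GB")
# -> f16 ~ 12.8 GB, 8-bit ~ 7.2 GB (both before headers and norm weights)
```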
- "... [output truncated: tensors 13-261 repeat the same pattern as layers.0 for layers 1-28, with near-identical sizes and histograms] ...\n", - "[ 262/ 291] layers.28.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. 
size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.047 0.031 0.020 0.027 \n", - "[ 263/ 291] layers.28.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 264/ 291] layers.28.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", - "[ 265/ 291] layers.29.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 266/ 291] layers.29.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 267/ 291] layers.29.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 268/ 291] layers.29.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 269/ 291] layers.29.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", - "[ 270/ 291] layers.29.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 271/ 291] layers.29.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.227 0.107 0.088 0.067 0.047 0.031 0.020 0.027 \n", - "[ 272/ 291] layers.29.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 273/ 291] layers.29.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", - "[ 274/ 291] layers.30.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.047 0.032 0.020 0.027 \n", - "[ 275/ 291] layers.30.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 276/ 291] layers.30.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.047 0.032 0.020 0.027 \n", - "[ 277/ 291] layers.30.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 278/ 291] layers.30.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", - "[ 279/ 291] layers.30.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 280/ 291] layers.30.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. 
size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.019 0.030 0.046 0.066 0.088 0.108 0.232 0.108 0.088 0.066 0.046 0.031 0.019 0.027 \n", - "[ 281/ 291] layers.30.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 282/ 291] layers.30.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", - "[ 283/ 291] layers.31.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.228 0.106 0.088 0.067 0.047 0.031 0.020 0.027 \n", - "[ 284/ 291] layers.31.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.047 0.031 0.020 0.027 \n", - "[ 285/ 291] layers.31.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.228 0.106 0.088 0.067 0.047 0.031 0.020 0.027 \n", - "[ 286/ 291] layers.31.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 287/ 291] layers.31.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", - "[ 288/ 291] layers.31.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 289/ 291] layers.31.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.019 0.030 0.045 0.065 0.088 0.109 0.237 0.109 0.088 0.065 0.045 0.030 0.019 0.027 \n", - "[ 290/ 291] layers.31.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. 
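The sha256sum cell above pins the exact artifact this run produced. A pure-Python equivalent, useful when the shell utility is unavailable, is sketched below; the path and expected digest are copied from that cell, and the rest is standard-library hashlib.

```python
import hashlib

# Path and digest exactly as printed by the sha256sum cell above.
MODEL_PATH = "./llama.cpp/zh-models/7B/ggml-model-q8_0.bin"
EXPECTED = "0eec8927427f159397c79961a28d62d78849514a4a19033b247edd6ac3fc2cfd"

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file through SHA-256 in 1 MiB chunks so a ~7GB model never sits in RAM."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

assert sha256_of(MODEL_PATH) == EXPECTED, "q8_0 model file does not match the published digest"
```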
- { - "cell_type": "markdown", - "metadata": { - "id": "DLkuRAo9Vkb1" - }, - "source": [ - "### (Optional) Test decoding with the quantized model\n", - "At this point all conversion steps are complete.\n", - "We run a single command to check that the model loads correctly and can hold a conversation.\n", - "\n", - "The FP16 and Q8-quantized files are stored under ./llama.cpp/zh-models/7B; download and use them as needed." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "tW-ep1BsVQtG", - "outputId": "b3b28e5e-c731-4bb5-d3ae-c09d4c7bfb81" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "main: seed = 1682671021\n", - "llama.cpp: loading model from ./zh-models/7B/ggml-model-q8_0.bin\n", - "llama_model_load_internal: format = ggjt v1 (latest)\n", - "llama_model_load_internal: n_vocab = 49954\n", - "llama_model_load_internal: n_ctx = 512\n", - "llama_model_load_internal: n_embd = 4096\n", - "llama_model_load_internal: n_mult = 256\n", - "llama_model_load_internal: n_head = 32\n", - "llama_model_load_internal: n_layer = 32\n", - "llama_model_load_internal: n_rot = 128\n", - "llama_model_load_internal: ftype = 7 (mostly Q8_0)\n", - "llama_model_load_internal: n_ff = 11008\n", - "llama_model_load_internal: n_parts = 1\n", - "llama_model_load_internal: model size = 7B\n", - "llama_model_load_internal: ggml ctx size = 59.11 KB\n", - "llama_model_load_internal: mem required = 9180.12 MB (+ 1026.00 MB per state)\n", - "llama_init_from_file: kv self size = 256.00 MB\n", - "\n", - "system_info: n_threads = 4 / 4 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | \n", - "sampling: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.100000\n", - "generate: n_ctx = 512, n_batch = 512, n_predict = 512, n_keep = 0\n", - "\n", - "\n", - "\u001b[33m 详细介绍一下北京的名胜古迹:\u001b[0m长城、故宫等。同时介绍一些小众景点,比如颐和园中的石舫、圆明园中的琉璃花门等等。 [end of text]\n", - "\n", - "llama_print_timings: load time = 19881.66 ms\n", - "llama_print_timings: sample time = 48.31 ms / 32 runs ( 1.51 ms per run)\n", - "llama_print_timings: prompt eval time = 11365.17 ms / 11 tokens ( 1033.20 ms per token)\n", - "llama_print_timings: eval time = 33910.03 ms / 31 runs ( 1093.87 ms per run)\n", - "llama_print_timings: total time = 53841.09 ms\n" - ] - } - ], - "source": [ - "!cd llama.cpp && ./main -m ./zh-models/7B/ggml-model-q8_0.bin --color -f ./prompts/alpaca.txt -p \"详细介绍一下北京的名胜古迹:\" -n 512" - ] - } - ], - "metadata": { - "accelerator": "TPU", - "colab": { - "machine_shape": "hm", - "provenance": [] - }, - "gpuClass": "premium", - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file
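The per-tensor sizes in the quantize log above can be reproduced by hand. A minimal sanity check, assuming the Q8_0 block layout ggml used at the time (a 4-byte f32 scale followed by 32 int8 values, i.e. 36 bytes per 32 weights; the layout is an assumption about this specific llama.cpp revision, as later versions switched to an f16 scale):

```python
# Back-of-the-envelope check of the quantize log above.
# ASSUMPTION: Q8_0 block = 4-byte f32 scale + 32 int8 weights = 36 bytes per 32 weights.
BYTES_PER_BLOCK = 36
WEIGHTS_PER_BLOCK = 32

def q8_0_mb(rows: int, cols: int) -> float:
    """Size in MB (1 MB = 2**20 bytes) of a rows x cols tensor after Q8_0 quantization."""
    n_weights = rows * cols
    return n_weights // WEIGHTS_PER_BLOCK * BYTES_PER_BLOCK / 2**20

print(q8_0_mb(4096, 4096))   # 18.0   -> matches "32.00 MB -> 18.00 MB" for attention tensors
print(q8_0_mb(4096, 11008))  # 48.375 -> matches "86.00 MB -> 48.38 MB" for feed-forward tensors
```

The same arithmetic explains the overall ratio reported by llama_model_quantize_internal: 36/32 bytes per weight instead of 2 bytes is a factor of 0.5625, which takes the 13133.55 MB f16 model to roughly the 7388.06 MB shown.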
"!cd llama.cpp && ./main -m ./zh-models/7B/ggml-model-q8_0.bin --color -f ./prompts/alpaca.txt -p \"详细介绍一下北京的名胜古迹:\" -n 512" - ] - } - ], - "metadata": { - "accelerator": "TPU", - "colab": { - "machine_shape": "hm", - "provenance": [] - }, - "gpuClass": "premium", - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/notebooks/convert_and_quantize_chinese_llama.ipynb b/notebooks/convert_and_quantize_chinese_llama.ipynb deleted file mode 100644 index ce077f3..0000000 --- a/notebooks/convert_and_quantize_chinese_llama.ipynb +++ /dev/null @@ -1,1874 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "machine_shape": "hm" - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - }, - "gpuClass": "standard", - "accelerator": "TPU" - }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# 转换并量化中文LLaMA/Alpaca模型\n", - "\n", - "🎉🎉🎉 **新:现在免费用户也有机会能够转换7B和13B模型了!**\n", - "\n", - "💡 提示和小窍门:\n", - "- 免费用户默认的内存只有12G左右,**笔者用免费账号实测选择TPU的话有机会随机出35G内存**,建议多试几次。如果能随机出25G内存以上的机器就可以了转换7B模型了,35G内存以上机器就能转换13B模型了\n", - "- Pro(+)用户请选择 “代码执行程序” -> “更改运行时类型” -> “高RAM”\n", - "- 实测:转换7B级别模型,25G内存的机器就够了;转换13B级别模型需要30G以上的内存(程序莫名崩掉或断开连接就说明内存爆了)\n", - "- 如果选了“高RAM”之后内存还是不够大的话,选择以下操作,有的时候会分配出很高内存的机器,祝你好运😄!\n", - " - 可以把GPU或者TPU也选上(虽然不会用到)\n", - " - 选GPU时,Pro用户可选“高级”类型GPU\n", - "\n", - "以下信息配置信息供参考(Pro订阅下测试),运行时规格设置为“高RAM”时的设备配置如下(有随机性):\n", - "\n", - "| 硬件加速器 | RAM | 硬盘 |\n", - "| :-- | :--: | :--: |\n", - "| None | 25GB | 225GB |\n", - "| TPU | 35GB | 225GB |\n", - "| GPU(标准,T4)| 25GB | 166GB |\n", - "| GPU(高性能,V100)| 25GB | 166GB |\n", - "| GPU(高性能,A100)| **80GB** | 166GB |\n", - "\n", - "*温馨提示:用完之后注意断开运行时,选择满足要求的最低配置即可,避免不必要的计算单元消耗(Pro只给100个计算单元)。*" - ], - "metadata": { - "id": "B1c96_k3MahN" - } - }, - { - "cell_type": "markdown", - "source": [ - "## 安装相关依赖" - ], - "metadata": { - "id": "vScqHD_jMFOV" - } - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "E5WKFJXIL6ZU", - "outputId": "7ce317e5-c105-49a8-d1af-70c29e6246e1" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Collecting transformers\n", - " Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.0/7.0 MB\u001b[0m \u001b[31m54.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from transformers) (2.27.1)\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.9/dist-packages (from transformers) (1.24.2)\n", - "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.9/dist-packages (from transformers) (6.0)\n", - "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.9/dist-packages (from transformers) (4.65.0)\n", - "Collecting huggingface-hub<1.0,>=0.11.0\n", - " Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m200.1/200.1 kB\u001b[0m \u001b[31m24.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already 
- { - "cell_type": "markdown", - "source": [ - "## Install dependencies" - ], - "metadata": { - "id": "vScqHD_jMFOV" - } - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "E5WKFJXIL6ZU", - "outputId": "7ce317e5-c105-49a8-d1af-70c29e6246e1" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Collecting transformers\n", - "[... pip dependency-resolution lines omitted ...]\n", - "Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.0\n", - "Collecting peft\n", - "[... pip dependency-resolution lines omitted ...]\n", - "Successfully installed accelerate-0.18.0 peft-0.2.0\n", - "Collecting sentencepiece\n", - "[... pip dependency-resolution lines omitted ...]\n", - "Successfully installed sentencepiece-0.1.98\n" - ] - } - ], - "source": [ - "!pip install transformers\n", - "!pip install peft\n", - "!pip install sentencepiece" - ] - },
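The cell above installs whatever is current, and the Colab base image drifts over time; to reproduce this exact run, pinning to the versions reported in the install log is safer. A suggested pin (versions copied from that log; the pinned command itself is not part of the original notebook):

```python
# Versions copied from the install log above; pinning guards against Colab image drift.
!pip install transformers==4.28.0 peft==0.2.0 accelerate==0.18.0 sentencepiece==0.1.98
```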
- { - "cell_type": "markdown", - "source": [ - "## Clone the repositories and code" - ], - "metadata": { - "id": "ygb1xFIMNQKw" - } - }, - { - "cell_type": "code", - "source": [ - "!git clone https://github.com/ymcui/Chinese-LLaMA-Alpaca\n", - "!git clone https://github.com/ggerganov/llama.cpp" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "yCEJh7NJNXz9", - "outputId": "91a0e4ff-af63-4f8e-ab82-ee4ddf583033" - }, - "execution_count": 2, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Cloning into 'Chinese-LLaMA-Alpaca'...\n", - "remote: Enumerating objects: 559, done.\u001b[K\n", - "remote: Counting objects: 100% (129/129), done.\u001b[K\n", - "remote: Compressing objects: 100% (115/115), done.\u001b[K\n", - "remote: Total 559 (delta 30), reused 22 (delta 14), pack-reused 430\u001b[K\n", - "Receiving objects: 100% (559/559), 10.71 MiB | 25.49 MiB/s, done.\n", - "Resolving deltas: 100% (333/333), done.\n", - "Cloning into 'llama.cpp'...\n", - "remote: Enumerating objects: 1701, done.\u001b[K\n", - "remote: Counting objects: 100% (1701/1701), done.\u001b[K\n", - "remote: Compressing objects: 100% (620/620), done.\u001b[K\n", - "remote: Total 1701 (delta 1084), reused 1623 (delta 1047), pack-reused 0\u001b[K\n", - "Receiving objects: 100% (1701/1701), 1.86 MiB | 14.74 MiB/s, done.\n", - "Resolving deltas: 100% (1084/1084), done.\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Merge the models (Alpaca-7B as an example)\n", - "\n", - "**⚠️ Another reminder: the 7B model needs 25GB of RAM; the 13B model needs 35GB+.**\n", - "\n", - "We use the base model hosted on the 🤗 Hub (already in HF format) rather than Facebook's official LLaMA release, so the step of converting the original LLaMA to HF format is skipped.\n", - "\n", - "**We run step 2 directly here: merging the LoRA weights** to produce the full model weights. You can point at a 🤗 Hub repo ID or at a local path.\n", - "- Base model: `decapoda-research/llama-7b-hf` *(use at your own risk)*\n", - "- LoRA model: `ziqingyang/chinese-alpaca-lora-7b`\n", - "\n", - "💡 Tips for converting the 13B model (a ready-made variant of the command is sketched right after this cell):\n", - "- Simply change `7b` to `13b` in the `--base_model` and `--lora_model` arguments\n", - "- **Free-tier users must add the `--offload_dir` argument to relieve memory pressure**, e.g. `--offload_dir ./offload_temp`\n", - "\n", - "This step is fairly time-consuming (download + conversion), taking from a few minutes to over ten; please be patient.\n", - "The converted model is stored in the `alpaca-combined` directory.\n", - "If you do not need a quantized model, you are done at this point." - ], - "metadata": { - "id": "nIyxX0DSNsgQ" - } - },
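Applying those 13B tips to the 7B command in the next cell yields the variant below. It is assembled purely from the instructions above (`7b` -> `13b` in both model IDs, plus `--offload_dir` for low-RAM runtimes) and was not run in this notebook:

```python
# 13B variant of the merge command, per the tips above; not executed in this notebook.
!python ./Chinese-LLaMA-Alpaca/scripts/merge_llama_with_chinese_lora.py \
    --base_model 'decapoda-research/llama-13b-hf' \
    --lora_model 'ziqingyang/chinese-alpaca-lora-13b' \
    --offload_dir ./offload_temp \
    --output_dir alpaca-combined
```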
- { - "cell_type": "code", - "source": [ - "!python ./Chinese-LLaMA-Alpaca/scripts/merge_llama_with_chinese_lora.py \\\n", - " --base_model 'decapoda-research/llama-7b-hf' \\\n", - " --lora_model 'ziqingyang/chinese-alpaca-lora-7b' \\\n", - " --output_dir alpaca-combined" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5AV4EW5hNhVV", - "outputId": "e34419d4-b7c9-4e22-af37-abf80d4163ba" - }, - "execution_count": 3, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "2023-04-14 10:13:45.382526: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "Downloading tokenizer.model: 100% 758k/758k [00:00<00:00, 12.7MB/s]\n", - "Downloading (…)cial_tokens_map.json: 100% 96.0/96.0 [00:00<00:00, 15.3kB/s]\n", - "Downloading (…)okenizer_config.json: 100% 166/166 [00:00<00:00, 63.2kB/s]\n", - "Downloading (…)lve/main/config.json: 100% 427/427 [00:00<00:00, 63.4kB/s]\n", - "Downloading (…)model.bin.index.json: 100% 25.5k/25.5k [00:00<00:00, 9.41MB/s]\n", - "Downloading shards: 0% 0/33 [00:00 121.96 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[2/291] norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[3/291] output.weight - [4096 x 49954], type = f16, quantizing .. size = 390.27 MB -> 121.96 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.126 0.134 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", - "[4/291] layers.0.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.021 0.016 0.028 0.046 0.071 0.103 0.137 0.158 0.137 0.103 0.071 0.046 0.028 0.016 0.021 \n", - "[5/291] layers.0.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.021 0.016 0.027 0.045 0.071 0.104 0.138 0.158 0.139 0.104 0.071 0.045 0.027 0.016 0.021 \n", - "[6/291] layers.0.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.018 0.032 0.051 0.076 0.103 0.128 0.141 0.128 0.103 0.075 0.051 0.032 0.019 0.022 \n", - "[7/291] layers.0.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.021 0.016 0.028 0.046 0.072 0.105 0.136 0.151 0.136 0.105 0.072 0.046 0.028 0.016 0.021 \n", - "[8/291] layers.0.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[... per-tensor quantization lines for tensors 9-186 (layers 0-20) omitted; the pattern repeats throughout: f16 attention tensors quantize 32.00 MB -> 10.00 MB, f16 feed-forward tensors 86.00 MB -> 26.88 MB, and the f32 norm weights stay at 0.016 MB ...]\n", - "[187/291] layers.20.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.124 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[188/291] layers.20.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[189/291] layers.20.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.124 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[190/291] layers.20.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[191/291] layers.20.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[192/291] layers.20.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[193/291] layers.21.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[194/291] layers.21.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[195/291] layers.21.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[196/291] layers.21.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.124 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[197/291] layers.21.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[198/291] layers.21.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.124 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[199/291] layers.21.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[200/291] layers.21.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[201/291] layers.21.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[202/291] layers.22.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[203/291] layers.22.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[204/291] layers.22.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.077 0.053 0.033 0.019 0.022 \n", - "[205/291] layers.22.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.124 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[206/291] layers.22.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[207/291] layers.22.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[208/291] layers.22.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[209/291] layers.22.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[210/291] layers.22.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[211/291] layers.23.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[212/291] layers.23.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.134 0.125 0.104 0.077 0.053 0.033 0.019 0.022 \n", - "[213/291] layers.23.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.077 0.053 0.033 0.019 0.022 \n", - "[214/291] layers.23.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[215/291] layers.23.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[216/291] layers.23.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.124 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[217/291] layers.23.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[218/291] layers.23.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[219/291] layers.23.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[220/291] layers.24.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[221/291] layers.24.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[222/291] layers.24.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[223/291] layers.24.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.124 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[224/291] layers.24.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[225/291] layers.24.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[226/291] layers.24.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[227/291] layers.24.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[228/291] layers.24.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[229/291] layers.25.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[230/291] layers.25.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[231/291] layers.25.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[232/291] layers.25.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[233/291] layers.25.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[234/291] layers.25.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[235/291] layers.25.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[236/291] layers.25.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[237/291] layers.25.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[238/291] layers.26.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[239/291] layers.26.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[240/291] layers.26.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[241/291] layers.26.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[242/291] layers.26.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[243/291] layers.26.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[244/291] layers.26.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[245/291] layers.26.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[246/291] layers.26.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[247/291] layers.27.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[248/291] layers.27.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[249/291] layers.27.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[250/291] layers.27.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[251/291] layers.27.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[252/291] layers.27.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[253/291] layers.27.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[254/291] layers.27.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[255/291] layers.27.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[256/291] layers.28.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[257/291] layers.28.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[258/291] layers.28.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.134 0.125 0.104 0.077 0.053 0.033 0.019 0.022 \n", - "[259/291] layers.28.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[260/291] layers.28.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[261/291] layers.28.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[262/291] layers.28.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[263/291] layers.28.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[264/291] layers.28.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[265/291] layers.29.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[266/291] layers.29.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[267/291] layers.29.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[268/291] layers.29.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[269/291] layers.29.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[270/291] layers.29.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[271/291] layers.29.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.032 0.052 0.077 0.104 0.126 0.134 0.126 0.104 0.077 0.052 0.032 0.019 0.022 \n", - "[272/291] layers.29.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[273/291] layers.29.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[274/291] layers.30.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.126 0.134 0.125 0.104 0.077 0.052 0.033 0.019 0.022 \n", - "[275/291] layers.30.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[276/291] layers.30.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.134 0.125 0.104 0.077 0.053 0.033 0.019 0.022 \n", - "[277/291] layers.30.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[278/291] layers.30.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[279/291] layers.30.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[280/291] layers.30.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.018 0.032 0.051 0.076 0.104 0.128 0.137 0.128 0.104 0.076 0.051 0.032 0.018 0.022 \n", - "[281/291] layers.30.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[282/291] layers.30.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[283/291] layers.31.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.032 0.052 0.077 0.104 0.126 0.135 0.126 0.104 0.077 0.052 0.032 0.019 0.022 \n", - "[284/291] layers.31.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.126 0.134 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", - "[285/291] layers.31.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.032 0.052 0.077 0.104 0.126 0.135 0.126 0.104 0.077 0.052 0.032 0.019 0.022 \n", - "[286/291] layers.31.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[287/291] layers.31.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[288/291] layers.31.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[289/291] layers.31.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.021 0.018 0.031 0.050 0.075 0.104 0.130 0.140 0.130 0.104 0.075 0.050 0.031 0.018 0.021 \n", - "[290/291] layers.31.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. 
- { - "cell_type": "markdown", - "source": [ - "### (Optional) Test decoding with the quantized model\n", - "All conversion steps are now complete.\n", - "We run a single command to check that the model loads correctly and can hold a conversation.\n", - "\n", - "The FP16 and Q4 quantized files are stored under ./llama.cpp/zh-models/7B and can be downloaded as needed." - ], - "metadata": { - "id": "DLkuRAo9Vkb1" - } - }, - { - "cell_type": "code", - "source": [ - "!cd llama.cpp && ./main -m ./zh-models/7B/ggml-model-q4_0.bin --color -f ./prompts/alpaca.txt -p \"详细介绍一下北京的名胜古迹:\" -n 512" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "tW-ep1BsVQtG", - "outputId": "0706c974-127e-4f21-be6b-d71ea4fb989b" - }, - "execution_count": 10, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "main: seed = 1681467955\n", - "llama.cpp: loading model from ./zh-models/7B/ggml-model-q4_0.bin\n", - "llama_model_load_internal: format = ggjt v1 (latest)\n", - "llama_model_load_internal: n_vocab = 49954\n", - "llama_model_load_internal: n_ctx = 512\n", - "llama_model_load_internal: n_embd = 4096\n", - "llama_model_load_internal: n_mult = 256\n", - "llama_model_load_internal: n_head = 32\n", - "llama_model_load_internal: n_layer = 32\n", - "llama_model_load_internal: n_rot = 128\n", - "llama_model_load_internal: ftype = 2 (mostly Q4_0)\n", - "llama_model_load_internal: n_ff = 11008\n", - "llama_model_load_internal: n_parts = 1\n", - "llama_model_load_internal: model size = 7B\n", - "llama_model_load_internal: ggml ctx size = 59.11 KB\n", - "llama_model_load_internal: mem required = 5896.99 MB (+ 1026.00 MB per state)\n", - "llama_init_from_file: kv self size = 256.00 MB\n", - "\n", - "system_info: n_threads = 40 / 40 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | \n", - "sampling: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.100000\n", - "generate: n_ctx = 512, n_batch = 8, n_predict = 512, n_keep = 0\n", - "\n", - "\n", - "\u001b[33m 详细介绍一下北京的名胜古迹:\u001b[0m\n", - " 故宫:明、清两代皇室,御花园及八达门大街。 宫殿内有大量文物珍品; [end of text]\n", - "\n", - "llama_print_timings: load time = 717.01 ms\n", - "llama_print_timings: sample time = 48.97 ms / 32 runs ( 1.53 ms per run)\n", - "llama_print_timings: prompt eval time = 680.93 ms / 11 tokens ( 61.90 ms per token)\n", - "llama_print_timings: eval time = 4490.00 ms / 31 runs ( 144.84 ms per run)\n", - "llama_print_timings: total time = 5461.05 ms\n" - ] - } - ] - } - ] -} \ No newline at end of file
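The test run above samples with temp = 0.8, top_k = 40, top_p = 0.95 and repeat_penalty = 1.1 over the last 64 tokens. As a reading aid for those knobs, here is a toy NumPy sketch of such a sampling pipeline (illustrative only; llama.cpp's actual sampler differs in ordering and implementation details):

```python
import numpy as np

def sample_token(logits, recent_ids, temp=0.8, top_k=40, top_p=0.95,
                 repeat_penalty=1.1, rng=np.random.default_rng()):
    logits = logits.astype(np.float64)
    for t in set(recent_ids):                    # penalize recently used tokens
        logits[t] = logits[t] / repeat_penalty if logits[t] > 0 else logits[t] * repeat_penalty
    logits /= temp                               # temperature scaling
    top = np.argsort(logits)[-top_k:]            # keep the top_k candidates
    p = np.exp(logits[top] - logits[top].max())  # softmax over the candidates
    p /= p.sum()
    desc = np.argsort(p)[::-1]                   # nucleus (top_p) cut
    n_keep = max(1, int(np.searchsorted(np.cumsum(p[desc]), top_p) + 1))
    keep = desc[:n_keep]
    p = p[keep] / p[keep].sum()
    return int(top[keep[rng.choice(len(keep), p=p)]])
```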
From 11630c43483ad41a621fa29ab40ef2af9cfc8546 Mon Sep 17 00:00:00 2001 From: ymcui Date: Thu, 15 Jun 2023 17:24:53 +0800 Subject: [PATCH 3/9] update legacy notebook --- .gitignore | 2 - ...nd_quantize_chinese_llama_and_alpaca.ipynb | 2568 +++++++++++++++++ ...ert_and_quantize_chinese_alpaca_plus.ipynb | 1171 ++++++++ .../convert_and_quantize_chinese_llama.ipynb | 1874 ++++++++++++ 4 files changed, 5613 insertions(+), 2 deletions(-) create mode 100644 notebooks/convert_and_quantize_chinese_llama_and_alpaca.ipynb create mode 100644 notebooks/legacy/convert_and_quantize_chinese_alpaca_plus.ipynb create mode 100644 notebooks/legacy/convert_and_quantize_chinese_llama.ipynb diff --git a/.gitignore b/.gitignore index 4d3a240..f0fed7e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,2 @@ .DS_Store */.DS_Store -*.ipynb -*/*.ipynb diff --git a/notebooks/convert_and_quantize_chinese_llama_and_alpaca.ipynb b/notebooks/convert_and_quantize_chinese_llama_and_alpaca.ipynb new file mode 100644 index 0000000..e15bab3 --- /dev/null +++ b/notebooks/convert_and_quantize_chinese_llama_and_alpaca.ipynb @@ -0,0 +1,2568 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "machine_shape": "hm", + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Convert and quantize the Chinese LLaMA and Alpaca models\n", + "\n", + "Project repo: https://github.com/ymcui/Chinese-LLaMA-Alpaca\n", + "\n", + "⚠️ Memory usage notes (make sure the machine you are allocated has more RAM than the following):\n", + "- 7B model: 15 GB+\n", + "- 13B model: 18 GB+\n", + "- 33B model: 22 GB+\n", + "\n", + "💡 Tips and tricks:\n", + "- Free-tier users get only about 12 GB of RAM by default, which is not enough to convert the models. **In our tests, selecting a TPU runtime can randomly yield a 35 GB machine**, so it is worth retrying a few times\n", + "- Pro(+) users should select \"Runtime\" -> \"Change runtime type\" -> \"High-RAM\"\n", + "- If the program crashes or the session disconnects for no apparent reason, you have run out of memory\n", + "- If \"High-RAM\" is still not enough, the steps below sometimes get you a machine with much more memory. Good luck 😄!\n", + " - Select a GPU or TPU as well (even though it will not be used)\n", + " - When selecting a GPU, Pro(+) users can choose the \"A100\" type\n", + "\n", + "*Friendly reminder: disconnect the runtime when you are done and pick the lowest configuration that meets the requirements, to avoid wasting compute units (Pro only grants 100 units).*" + ], + "metadata": { + "id": "B1c96_k3MahN" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Install dependencies" + ], + "metadata": { + "id": "vScqHD_jMFOV" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "E5WKFJXIL6ZU", + "outputId": "a7baeebb-9b74-4d14-93dc-fb1f6e1b3716" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Requirement already satisfied: torch==1.13.1 in /usr/local/lib/python3.10/dist-packages (1.13.1)\n", + "Requirement already satisfied: transformers==4.30.2 in /usr/local/lib/python3.10/dist-packages (4.30.2)\n", + "[... pip dependency-resolution log trimmed: the dependencies of torch 1.13.1 and transformers 4.30.2 were already satisfied; peft 0.3.0 and accelerate 0.20.3 were downloaded and installed ...]\n", + "Installing collected packages: accelerate, peft\n", + "Successfully installed accelerate-0.20.3 peft-0.3.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (0.1.99)\n" + ] + } + ], + "source": [ + "!pip install torch==1.13.1\n", + "!pip install transformers==4.30.2\n", + "!pip install peft==0.3.0\n", + "!pip install sentencepiece" + ] + },
{ + "cell_type": "markdown", + "source": [ + "## Clone the repositories and code" + ], + "metadata": { + "id": "ygb1xFIMNQKw" + } + }, + { + "cell_type": "code", + "source": [ + "!git clone https://github.com/ymcui/Chinese-LLaMA-Alpaca\n", + "!git clone https://github.com/ggerganov/llama.cpp" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yCEJh7NJNXz9", + "outputId": "bfa34a83-a8b9-4e24-e956-83c7313eb448" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Cloning into 'Chinese-LLaMA-Alpaca'...\n", + "remote: Enumerating objects: 1407, done.\u001b[K\n", + "remote: Counting objects: 100% (599/599), done.\u001b[K\n", + "remote: Compressing objects: 100% (257/257), done.\u001b[K\n", + "remote: Total 1407 (delta 369), reused 494 (delta 338), pack-reused 808\u001b[K\n", + "Receiving objects: 100% (1407/1407), 22.61 MiB | 27.14 MiB/s, done.\n", + "Resolving deltas: 100% (831/831), done.\n", + "Cloning into 'llama.cpp'...\n", + "remote: Enumerating objects: 3618, done.\u001b[K\n", + "remote: Counting objects: 100% (1155/1155), done.\u001b[K\n", + "remote: Compressing objects: 100% (124/124), done.\u001b[K\n", + "remote: Total 3618 (delta 1076), reused 1036 (delta 1031), pack-reused 2463\u001b[K\n", + "Receiving objects: 100% (3618/3618), 3.28 MiB | 21.36 MiB/s, done.\n", + "Resolving deltas: 100% (2424/2424), done.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Merge the models (using Alpaca-7B as an example)\n", + "\n", + "Here we use the base model hosted on the 🤗 Hub (already in HF format) rather than Facebook's official LLaMA release, so the step of converting the original LLaMA weights to HF format is skipped.\n", + "**We run step two directly: merging the LoRA weights** to produce the full model weights. You can pass either a 🤗 Hub repo ID or a local directory.\n", + "- Base model: `elinas/llama-7b-hf-transformers-4.29` *(use at your own risk; we verified that its SHA256 matches the official release, but you should make sure you are entitled to use this model)*\n", + "- LoRA model: `ziqingyang/chinese-alpaca-lora-7b`\n", + " - For an Alpaca-Plus model, remember to pass both the llama and the alpaca LoRAs; tutorial: [here](https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/手动模型合并与转换#多lora权重合并适用于chinese-alpaca-plus)\n", + "- Output format: pth or huggingface; we choose pth here because llama.cpp quantization comes next\n", + "\n", + "Since the models have to be downloaded, please be patient, especially for the 33B model.\n", + "The merged model is stored in the `alpaca-combined` directory.\n", + "If you do not need a quantized model, you are done at this point and can download the result or copy it to Google Drive." + ], + "metadata": { + "id": "nIyxX0DSNsgQ" + } + },
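For readers wondering what "merging the LoRA weights" computes: for every weight matrix the adapter targets, the low-rank update is folded back into the dense matrix. A minimal PyTorch sketch of that core update (function and variable names are illustrative, not the merge script's actual code):

```python
import torch

def fold_lora(W: torch.Tensor, lora_A: torch.Tensor, lora_B: torch.Tensor,
              lora_alpha: float, r: int) -> torch.Tensor:
    """Return the merged dense weight W' = W + (lora_alpha / r) * B @ A.

    Shapes: W [out, in]; lora_A [r, in]; lora_B [out, r].
    """
    scaling = lora_alpha / r
    return W + scaling * (lora_B @ lora_A)

# Toy shapes only; the real tensors come from the base and LoRA checkpoints.
merged = fold_lora(torch.zeros(8, 16), torch.randn(2, 16), torch.randn(8, 2),
                   lora_alpha=4.0, r=2)
```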
+ { + "cell_type": "code", + "source": [ + "!python ./Chinese-LLaMA-Alpaca/scripts/merge_llama_with_chinese_lora_low_mem.py \\\n", + " --base_model 'elinas/llama-7b-hf-transformers-4.29' \\\n", + " --lora_model 'ziqingyang/chinese-alpaca-lora-7b' \\\n", + " --output_type pth \\\n", + " --output_dir alpaca-combined" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5AV4EW5hNhVV", + "outputId": "5cb36099-4ca1-403e-c6b5-c8c8441eaa11" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Base model: elinas/llama-7b-hf-transformers-4.29\n", + "LoRA model(s) ['ziqingyang/chinese-alpaca-lora-7b']:\n", + "Loading ziqingyang/chinese-alpaca-lora-7b\n", + "Cannot find lora model on the disk. Downloading lora model from hub...\n", + "Fetching 7 files: 0% 0/7 [00:00 109.76 MB | hist: \n", + "[ 2/ 291] norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 3/ 291] output.weight - 4096 x 49954, type = f16, quantizing .. size = 390.27 MB -> 160.07 MB | hist: \n", + "[ 4/ 291] layers.0.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 5/ 291] layers.0.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 6/ 291] layers.0.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 13.12 MB | hist: \n", + "[ 7/ 291] layers.0.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 8/ 291] layers.0.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 9/ 291] layers.0.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 10/ 291] layers.0.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 35.27 MB | hist: \n", + "[ 11/ 291] layers.0.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 12/ 291] layers.0.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[... per-layer quantization log for layers 1-19 trimmed: attention tensors drop from 32.00 MB to 9.00 MB (13.12 MB for some wv tensors), feed_forward tensors from 86.00 MB to 24.19 MB (35.27 MB for some w2 tensors), and norm weights stay f32 at 0.016 MB ...]\n", + "[ 184/ 291] layers.20.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 185/ 291] layers.20.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 186/ 291] layers.20.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 187/ 291] layers.20.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 188/ 291] layers.20.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 189/ 291] layers.20.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 190/ 291] layers.20.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 191/ 291] layers.20.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 192/ 291] layers.20.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 193/ 291] layers.21.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 194/ 291] layers.21.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 195/ 291] layers.21.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 13.12 MB | hist: \n", + "[ 196/ 291] layers.21.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 197/ 291] layers.21.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 198/ 291] layers.21.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 199/ 291] layers.21.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 35.27 MB | hist: \n", + "[ 200/ 291] layers.21.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 201/ 291] layers.21.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 202/ 291] layers.22.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 203/ 291] layers.22.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 204/ 291] layers.22.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 205/ 291] layers.22.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 206/ 291] layers.22.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 207/ 291] layers.22.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 208/ 291] layers.22.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 209/ 291] layers.22.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 210/ 291] layers.22.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 211/ 291] layers.23.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 212/ 291] layers.23.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 213/ 291] layers.23.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 214/ 291] layers.23.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 215/ 291] layers.23.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 216/ 291] layers.23.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 217/ 291] layers.23.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 218/ 291] layers.23.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 219/ 291] layers.23.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 220/ 291] layers.24.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 221/ 291] layers.24.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 222/ 291] layers.24.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 13.12 MB | hist: \n", + "[ 223/ 291] layers.24.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 224/ 291] layers.24.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 225/ 291] layers.24.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 226/ 291] layers.24.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 35.27 MB | hist: \n", + "[ 227/ 291] layers.24.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 228/ 291] layers.24.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 229/ 291] layers.25.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 230/ 291] layers.25.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 231/ 291] layers.25.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 232/ 291] layers.25.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 233/ 291] layers.25.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 234/ 291] layers.25.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 235/ 291] layers.25.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 236/ 291] layers.25.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 237/ 291] layers.25.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 238/ 291] layers.26.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 239/ 291] layers.26.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 240/ 291] layers.26.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 241/ 291] layers.26.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 242/ 291] layers.26.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 243/ 291] layers.26.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. 
size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 244/ 291] layers.26.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 245/ 291] layers.26.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 246/ 291] layers.26.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 247/ 291] layers.27.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 248/ 291] layers.27.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 249/ 291] layers.27.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 13.12 MB | hist: \n", + "[ 250/ 291] layers.27.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 251/ 291] layers.27.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 252/ 291] layers.27.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 253/ 291] layers.27.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 35.27 MB | hist: \n", + "[ 254/ 291] layers.27.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 255/ 291] layers.27.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 256/ 291] layers.28.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 257/ 291] layers.28.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 258/ 291] layers.28.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 13.12 MB | hist: \n", + "[ 259/ 291] layers.28.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 260/ 291] layers.28.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 261/ 291] layers.28.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 262/ 291] layers.28.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 35.27 MB | hist: \n", + "[ 263/ 291] layers.28.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 264/ 291] layers.28.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 265/ 291] layers.29.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 266/ 291] layers.29.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 267/ 291] layers.29.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 13.12 MB | hist: \n", + "[ 268/ 291] layers.29.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 269/ 291] layers.29.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 270/ 291] layers.29.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 271/ 291] layers.29.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 35.27 MB | hist: \n", + "[ 272/ 291] layers.29.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. 
size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 273/ 291] layers.29.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 274/ 291] layers.30.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 275/ 291] layers.30.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 276/ 291] layers.30.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 13.12 MB | hist: \n", + "[ 277/ 291] layers.30.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 278/ 291] layers.30.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 279/ 291] layers.30.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 280/ 291] layers.30.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 35.27 MB | hist: \n", + "[ 281/ 291] layers.30.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 282/ 291] layers.30.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 283/ 291] layers.31.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 284/ 291] layers.31.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 285/ 291] layers.31.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 13.12 MB | hist: \n", + "[ 286/ 291] layers.31.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 287/ 291] layers.31.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 288/ 291] layers.31.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 289/ 291] layers.31.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 35.27 MB | hist: \n", + "[ 290/ 291] layers.31.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. 
size = 86.00 MB -> 24.19 MB | hist: \n",
+ "[ 291/ 291] layers.31.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n",
+ "llama_model_quantize_internal: model size = 13133.55 MB\n",
+ "llama_model_quantize_internal: quant size = 3988.22 MB\n",
+ "\n",
+ "main: quantize time = 153421.48 ms\n",
+ "main: total time = 153421.48 ms\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### (Optional) Test decoding with the quantized model\n",
+ "All conversion steps are now complete.\n",
+ "Let's run one command to check that the model loads properly and can hold a conversation.\n",
+ "\n",
+ "The FP16 and Q4 quantized files are stored under ./llama.cpp/zh-models/7B and can be downloaded as needed."
+ ],
+ "metadata": {
+ "id": "DLkuRAo9Vkb1"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!cd llama.cpp && ./main -m ./zh-models/7B/ggml-model-q4_K.bin --color -p \"详细介绍一下北京的名胜古迹:\" -n 128"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "tW-ep1BsVQtG",
+ "outputId": "03f0343f-3b7c-490e-a0ab-6724d79c5dc8"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "main: build = 670 (254a7a7)\n",
+ "main: seed = 1686819449\n",
+ "llama.cpp: loading model from ./zh-models/7B/ggml-model-q4_K.bin\n",
+ "llama_model_load_internal: format = ggjt v3 (latest)\n",
+ "llama_model_load_internal: n_vocab = 49954\n",
+ "llama_model_load_internal: n_ctx = 512\n",
+ "llama_model_load_internal: n_embd = 4096\n",
+ "llama_model_load_internal: n_mult = 256\n",
+ "llama_model_load_internal: n_head = 32\n",
+ "llama_model_load_internal: n_layer = 32\n",
+ "llama_model_load_internal: n_rot = 128\n",
+ "llama_model_load_internal: ftype = 15 (mostly Q4_K - Medium)\n",
+ "llama_model_load_internal: n_ff = 11008\n",
+ "llama_model_load_internal: n_parts = 1\n",
+ "llama_model_load_internal: model size = 7B\n",
+ "llama_model_load_internal: ggml ctx size = 0.07 MB\n",
+ "llama_model_load_internal: mem required = 5780.29 MB (+ 1026.00 MB per state)\n",
+ "................................................................................................\n",
+ "llama_init_from_file: kv self size = 256.00 MB\n",
+ "\n",
+ "system_info: n_threads = 4 / 4 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | \n",
+ "sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000\n",
+ "generate: n_ctx = 512, n_batch = 512, n_predict = 128, n_keep = 0\n",
+ "\n",
+ "\n",
+ "\u001b[33m 详细介绍一下北京的名胜古迹:\u001b[0m天安门、故宫、颐和园、圆明园、北海公园等。 参观后你一定会爱上这座城市! [end of text]\n",
+ "\n",
+ "llama_print_timings: load time = 16410.24 ms\n",
+ "llama_print_timings: sample time = 30.04 ms / 30 runs ( 1.00 ms per token)\n",
+ "llama_print_timings: prompt eval time = 3479.21 ms / 11 tokens ( 316.29 ms per token)\n",
+ "llama_print_timings: eval time = 10516.40 ms / 29 runs ( 362.63 ms per token)\n",
+ "llama_print_timings: total time = 14042.46 ms\n"
+ ]
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/notebooks/legacy/convert_and_quantize_chinese_alpaca_plus.ipynb b/notebooks/legacy/convert_and_quantize_chinese_alpaca_plus.ipynb
new file mode 100644
index 0000000..b3bf1e3
--- /dev/null
+++ b/notebooks/legacy/convert_and_quantize_chinese_alpaca_plus.ipynb
@@ -0,0 +1,1171 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "B1c96_k3MahN"
+ },
+ "source": [
+ "# Convert and Quantize the Chinese Alpaca Plus Models\n",
+ "\n",
+ "For other models, please refer to the companion notebook: https://colab.research.google.com/drive/1Eak6azD3MLeb-YsfbP8UZC8wrL1ddIMI?usp=sharing\n",
+ "\n",
+ "\n",
+ "🎉🎉🎉 **New: free-tier users now also have a chance to convert the 7B and 13B models!**\n",
+ "\n",
+ "💡 Tips and tricks:\n",
+ "- Free-tier accounts get only about 12 GB of RAM by default. **In our tests, a free account that selects a TPU runtime sometimes gets a 35 GB machine at random**, so it is worth trying several times. A machine with more than 25 GB of RAM is enough to convert the 7B model; more than 35 GB is enough for the 13B model\n",
+ "- Pro(+) users should choose \"Runtime\" -> \"Change runtime type\" -> \"High-RAM\"\n",
+ "- Measured: converting a 7B-class model needs a 25 GB machine; converting a 13B-class model needs more than 30 GB of RAM (if the program crashes inexplicably or the session disconnects, you ran out of memory)\n",
+ "- If RAM is still too small after choosing \"High-RAM\", the following options sometimes get you a machine with much more memory. Good luck 😄!\n",
+ " - Also select a GPU or TPU (even though it will not be used)\n",
+ " - When selecting a GPU, Pro users can pick the \"Premium\" GPU class\n",
+ "\n",
+ "The following configurations are for reference (tested under a Pro subscription). With the runtime shape set to \"High-RAM\", the assigned hardware was as follows (with some randomness):\n",
+ "\n",
+ "| Hardware accelerator | RAM | Disk |\n",
+ "| :-- | :--: | :--: |\n",
+ "| None | 25GB | 225GB |\n",
+ "| TPU | 35GB | 225GB |\n",
+ "| GPU (standard, T4) | 25GB | 166GB |\n",
+ "| GPU (premium, V100) | 25GB | 166GB |\n",
+ "| GPU (premium, A100) | **80GB** | 166GB |\n",
+ "\n",
+ "*Friendly reminder: disconnect the runtime when you are done, and choose the lowest configuration that meets your needs to avoid wasting compute units (Pro only grants 100 compute units).*"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "vScqHD_jMFOV"
+ },
+ "source": [
+ "## Install dependencies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "E5WKFJXIL6ZU",
+ "outputId": "87a89bed-053e-4e61-e2f8-1dfcbdf87fbf"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Collecting torch==1.12.0\n",
+ " Downloading torch-1.12.0-cp310-cp310-manylinux1_x86_64.whl (776.3 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m776.3/776.3 MB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch==1.12.0) (4.5.0)\n",
+ "Installing collected packages: torch\n",
+ " Attempting uninstall: torch\n",
+ " Found existing installation: torch 2.0.0+cu118\n",
+ " Uninstalling torch-2.0.0+cu118:\n",
+ " Successfully uninstalled torch-2.0.0+cu118\n",
+ "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "torchvision 0.15.1+cu118 requires torch==2.0.0, but you have torch 1.12.0 which is incompatible.\n", + "torchtext 0.15.1 requires torch==2.0.0, but you have torch 1.12.0 which is incompatible.\n", + "torchdata 0.6.0 requires torch==2.0.0, but you have torch 1.12.0 which is incompatible.\n", + "torchaudio 2.0.1+cu118 requires torch==2.0.0, but you have torch 1.12.0 which is incompatible.\n", + "peft 0.2.0 requires torch>=1.13.0, but you have torch 1.12.0 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed torch-1.12.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.28.1)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n", + "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.13.3)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4)\n", + "Requirement already satisfied: huggingface-hub<1.0,>=0.11.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.14.1)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0)\n", + "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (2023.4.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (4.5.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.12)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.15)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2022.12.7)\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting git+https://github.com/huggingface/peft\n", + " Cloning https://github.com/huggingface/peft to /tmp/pip-req-build-tnxzt7q0\n", + " Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft /tmp/pip-req-build-tnxzt7q0\n", + " Resolved https://github.com/huggingface/peft to commit 632997d1fb776c3cf05d8c2537ac9a98a7ce9435\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (23.1)\n", + "Requirement already satisfied: accelerate in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (0.18.0)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (1.22.4)\n", + "Collecting torch>=1.13.0\n", + " Downloading torch-2.0.0-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m619.9/619.9 MB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (6.0)\n", + "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (5.9.5)\n", + "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (4.28.1)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (3.1)\n", + "Collecting nvidia-cufft-cu11==10.9.0.58\n", + " Downloading nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux1_x86_64.whl (168.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m168.4/168.4 MB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cudnn-cu11==8.5.0.96\n", + " Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl (557.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m557.1/557.1 MB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (2.0.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (3.12.0)\n", + "Collecting nvidia-cuda-runtime-cu11==11.7.99\n", + " Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m849.3/849.3 kB\u001b[0m \u001b[31m48.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (3.1.2)\n", + "Collecting nvidia-nccl-cu11==2.14.3\n", + " Downloading nvidia_nccl_cu11-2.14.3-py3-none-manylinux1_x86_64.whl (177.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m177.1/177.1 MB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (1.11.1)\n", + "Collecting nvidia-cusparse-cu11==11.7.4.91\n", + " Downloading nvidia_cusparse_cu11-11.7.4.91-py3-none-manylinux1_x86_64.whl (173.2 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m173.2/173.2 MB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cublas-cu11==11.10.3.66\n", + " Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl (317.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m 
\u001b[32m317.1/317.1 MB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-nvtx-cu11==11.7.91\n", + " Downloading nvidia_nvtx_cu11-11.7.91-py3-none-manylinux1_x86_64.whl (98 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.6/98.6 kB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (4.5.0)\n", + "Collecting nvidia-curand-cu11==10.2.10.91\n", + " Downloading nvidia_curand_cu11-10.2.10.91-py3-none-manylinux1_x86_64.whl (54.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.6/54.6 MB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cusolver-cu11==11.4.0.1\n", + " Downloading nvidia_cusolver_cu11-11.4.0.1-2-py3-none-manylinux1_x86_64.whl (102.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m102.6/102.6 MB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cuda-nvrtc-cu11==11.7.99\n", + " Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.0/21.0 MB\u001b[0m \u001b[31m63.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cuda-cupti-cu11==11.7.101\n", + " Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.8/11.8 MB\u001b[0m \u001b[31m75.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: wheel in /usr/local/lib/python3.10/dist-packages (from nvidia-cublas-cu11==11.10.3.66->torch>=1.13.0->peft==0.3.0.dev0) (0.40.0)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from nvidia-cublas-cu11==11.10.3.66->torch>=1.13.0->peft==0.3.0.dev0) (67.7.2)\n", + "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.13.0->peft==0.3.0.dev0) (3.25.2)\n", + "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.13.0->peft==0.3.0.dev0) (16.0.2)\n", + "Requirement already satisfied: huggingface-hub<1.0,>=0.11.0 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (0.14.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (2022.10.31)\n", + "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (0.13.3)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (4.65.0)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (2.27.1)\n", + "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers->peft==0.3.0.dev0) (2023.4.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.13.0->peft==0.3.0.dev0) (2.1.2)\n", 
+ "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.3.0.dev0) (2.0.12)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.3.0.dev0) (2022.12.7)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.3.0.dev0) (3.4)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.3.0.dev0) (1.26.15)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.13.0->peft==0.3.0.dev0) (1.3.0)\n", + "Building wheels for collected packages: peft\n", + " Building wheel for peft (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for peft: filename=peft-0.3.0.dev0-py3-none-any.whl size=55537 sha256=3cc2a65c09926ac217ac671b7d9c1640eac9857f0aca55b78a9fcda484263073\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-1rjlvx70/wheels/4c/16/67/1002a2d4daa822eff130e6d85b90051b75d2ce0d26b9448e4a\n", + "Successfully built peft\n", + "Installing collected packages: nvidia-nvtx-cu11, nvidia-nccl-cu11, nvidia-cusparse-cu11, nvidia-curand-cu11, nvidia-cufft-cu11, nvidia-cuda-runtime-cu11, nvidia-cuda-nvrtc-cu11, nvidia-cuda-cupti-cu11, nvidia-cublas-cu11, nvidia-cusolver-cu11, nvidia-cudnn-cu11, torch, peft\n", + " Attempting uninstall: torch\n", + " Found existing installation: torch 1.12.0\n", + " Uninstalling torch-1.12.0:\n", + " Successfully uninstalled torch-1.12.0\n", + " Attempting uninstall: peft\n", + " Found existing installation: peft 0.2.0\n", + " Uninstalling peft-0.2.0:\n", + " Successfully uninstalled peft-0.2.0\n", + "Successfully installed nvidia-cublas-cu11-11.10.3.66 nvidia-cuda-cupti-cu11-11.7.101 nvidia-cuda-nvrtc-cu11-11.7.99 nvidia-cuda-runtime-cu11-11.7.99 nvidia-cudnn-cu11-8.5.0.96 nvidia-cufft-cu11-10.9.0.58 nvidia-curand-cu11-10.2.10.91 nvidia-cusolver-cu11-11.4.0.1 nvidia-cusparse-cu11-11.7.4.91 nvidia-nccl-cu11-2.14.3 nvidia-nvtx-cu11-11.7.91 peft-0.3.0.dev0 torch-2.0.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (0.1.98)\n" + ] + } + ], + "source": [ + "!pip install torch==1.12.0\n", + "!pip install transformers\n", + "!pip install git+https://github.com/huggingface/peft\n", + "!pip install sentencepiece" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ygb1xFIMNQKw" + }, + "source": [ + "## 克隆目录和代码" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yCEJh7NJNXz9", + "outputId": "ec16f31b-7af7-4eb8-82ce-5f9317bad941" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Cloning into 'Chinese-LLaMA-Alpaca'...\n", + "remote: Enumerating objects: 761, done.\u001b[K\n", + "remote: Counting objects: 100% (202/202), done.\u001b[K\n", + "remote: Compressing objects: 100% (172/172), done.\u001b[K\n", + "remote: Total 761 (delta 54), reused 69 (delta 29), pack-reused 559\u001b[K\n", + "Receiving objects: 100% (761/761), 11.16 MiB | 22.49 MiB/s, done.\n", + "Resolving deltas: 100% (444/444), done.\n", + "Cloning into 'llama.cpp'...\n", + "remote: Enumerating objects: 2086, done.\u001b[K\n", + 
"remote: Counting objects: 100% (842/842), done.\u001b[K\n", + "remote: Compressing objects: 100% (99/99), done.\u001b[K\n", + "remote: Total 2086 (delta 778), reused 756 (delta 743), pack-reused 1244\u001b[K\n", + "Receiving objects: 100% (2086/2086), 2.12 MiB | 16.33 MiB/s, done.\n", + "Resolving deltas: 100% (1345/1345), done.\n" + ] + } + ], + "source": [ + "!git clone https://github.com/ymcui/Chinese-LLaMA-Alpaca\n", + "!git clone https://github.com/ggerganov/llama.cpp" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nIyxX0DSNsgQ" + }, + "source": [ + "## 合并模型(Alpaca-Plus-7B)\n", + "\n", + "**⚠️ 再次提醒:7B模型需要25G内存,13B模型需要35G+内存。**\n", + "\n", + "此处使用的是🤗模型库中提供的基模型(已是HF格式),而不是Facebook官方的LLaMA模型,因此略去将原版LLaMA转换为HF格式的步骤。\n", + "\n", + "**这里直接运行第二步:合并LoRA权重**,生成全量模型权重。可以直接指定🤗模型库的地址,也可以是本地存放地址。\n", + "- 基模型:`decapoda-research/llama-7b-hf` *(use at your own risk)*\n", + "- LoRA模型:先写`ziqingyang/chinese-llama-plus-lora-7b`然后再写`ziqingyang/chinese-alpaca-plus-lora-7b`\n", + "- 输出类型:因为后续要量化,这里将`output_type`设置为`pth`\n", + "\n", + "💡 转换13B模型提示:\n", + "- 请将参数`--base_model`和`--lora_model`中的的`7b`改为`13b`即可\n", + "- **免费用户必须增加一个参数`--offload_dir`以缓解内存压力**,例如`--offload_dir ./offload_temp`\n", + "\n", + "该过程比较耗时(下载+转换),需要几分钟到十几分钟不等,请耐心等待。\n", + "转换好的模型存放在`alpaca-combined`目录。\n", + "如果你不需要量化模型,那么到这一步就结束了。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5AV4EW5hNhVV", + "outputId": "91901b82-88c4-405d-cf86-32f1a3a60467" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2023-04-28 08:07:00.276520: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "Base model: decapoda-research/llama-7b-hf\n", + "LoRA model(s) ['ziqingyang/chinese-llama-plus-lora-7b', 'ziqingyang/chinese-alpaca-plus-lora-7b']:\n", + "Loading checkpoint shards: 100% 33/33 [01:18<00:00, 2.39s/it]\n", + "Peft version: 0.3.0.dev0\n", + "Loading LoRA for 7B model\n", + "Loading LoRA ziqingyang/chinese-llama-plus-lora-7b\n", + "Extended vocabulary size to 49953\n", + "Downloading (…)/adapter_config.json: 100% 420/420 [00:00<00:00, 1.61MB/s]\n", + "Downloading adapter_model.bin: 100% 858M/858M [00:04<00:00, 185MB/s]\n", + "Merging with merge_and_unload...\n", + "Loading LoRA ziqingyang/chinese-alpaca-plus-lora-7b\n", + "Downloading tokenizer.model: 100% 758k/758k [00:00<00:00, 13.4MB/s]\n", + "Downloading (…)cial_tokens_map.json: 100% 96.0/96.0 [00:00<00:00, 535kB/s]\n", + "Downloading (…)okenizer_config.json: 100% 166/166 [00:00<00:00, 854kB/s]\n", + "Extended vocabulary size to 49954\n", + "Downloading (…)/adapter_config.json: 100% 423/423 [00:00<00:00, 2.31MB/s]\n", + "Downloading adapter_model.bin: 100% 1.14G/1.14G [00:16<00:00, 70.6MB/s]\n", + "Merging with merge_and_unload...\n", + "Saving to pth format...\n", + "Saving shard 1 of 1 into alpaca-combined/consolidated.00.pth\n" + ] + } + ], + "source": [ + "!python ./Chinese-LLaMA-Alpaca/scripts/merge_llama_with_chinese_lora.py \\\n", + " --base_model decapoda-research/llama-7b-hf \\\n", + " --lora_model ziqingyang/chinese-llama-plus-lora-7b,ziqingyang/chinese-alpaca-plus-lora-7b \\\n", + " --output_type pth \\\n", + " --output_dir alpaca-combined" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ueexcKo-Q_EW" + }, + "source": [ + "## 量化模型\n", + "接下来我们使用[llama.cpp](https://github.com/ggerganov/llama.cpp)工具对上一步生成的全量版本权重进行转换,生成4-bit量化模型。\n", + "\n", + "### 编译工具\n", + "\n", + 
"首先对llama.cpp工具进行编译。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_GbjsT2wRRCR", + "outputId": "2b4f2a38-d22d-4764-9a81-bad8bd72b7fe" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "I llama.cpp build info: \n", + "I UNAME_S: Linux\n", + "I UNAME_P: x86_64\n", + "I UNAME_M: x86_64\n", + "I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -pthread -march=native -mtune=native\n", + "I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native\n", + "I LDFLAGS: \n", + "I CC: cc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", + "I CXX: g++ (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", + "\n", + "cc -I. -O3 -DNDEBUG -std=c11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -pthread -march=native -mtune=native -c ggml.c -o ggml.o\n", + "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native -c llama.cpp -o llama.o\n", + "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native -c examples/common.cpp -o common.o\n", + "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native examples/main/main.cpp ggml.o llama.o common.o -o main \n", + "\n", + "==== Run ./main -h for help. ====\n", + "\n", + "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native examples/quantize/quantize.cpp ggml.o llama.o -o quantize \n", + "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats \n", + "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity \n", + "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding \n", + "g++ -I. 
+ "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native pocs/vdot/vdot.cpp ggml.o -o vdot \n"
+ ]
+ }
+ ],
+ "source": [
+ "!cd llama.cpp && make"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "gw2xpYC0RcQC"
+ },
+ "source": [
+ "### Convert the model to ggml format (FP16)\n",
+ "\n",
+ "In this step, we convert the model to ggml format (FP16).\n",
+ "- Before doing so, relocate the `alpaca-combined` directory: put the model files under `llama.cpp/zh-models/7B`, and put `tokenizer.model` under `llama.cpp/zh-models`\n",
+ "- Where is the tokenizer?\n",
+ " - It is in the `alpaca-combined` directory\n",
+ " - Or download it from: https://huggingface.co/ziqingyang/chinese-alpaca-lora-7b/resolve/main/tokenizer.model (note: the Alpaca and LLaMA `tokenizer.model` files must not be mixed up!); see the optional download sketch below\n",
+ "\n",
+ "💡 Tips for converting the 13B model:\n",
+ "- The 7B tokenizer can be used directly; the 13B and 7B tokenizers are identical\n",
+ "- The Alpaca and LLaMA `tokenizer.model` files must not be mixed up!\n",
+ "- Every `7B` you see below is just a folder name and no longer affects the conversion; renaming it is optional"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "5KgnFVStRjio",
+ "outputId": "19293a4a-a400-4cd3-c98b-80022dcd1f35"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "7B tokenizer.model\n"
+ ]
+ }
+ ],
+ "source": [
+ "!cd llama.cpp && mkdir zh-models && mv ../alpaca-combined zh-models/7B\n",
+ "!mv llama.cpp/zh-models/7B/tokenizer.model llama.cpp/zh-models/\n",
+ "!ls llama.cpp/zh-models/"
+ ]
+ },
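+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "(Optional) If your `alpaca-combined` directory did not include `tokenizer.model`, a sketch for fetching the Alpaca tokenizer from the URL mentioned above; note that it overwrites any existing `llama.cpp/zh-models/tokenizer.model`:\n",
+ "\n",
+ "```bash\n",
+ "# Sketch only: download the 7B Alpaca tokenizer (also valid for 13B) into zh-models\n",
+ "!wget -O llama.cpp/zh-models/tokenizer.model https://huggingface.co/ziqingyang/chinese-alpaca-lora-7b/resolve/main/tokenizer.model\n",
+ "```"
+ ]
+ },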
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "NUHeoTMQS1AQ",
+ "outputId": "378b70db-d13b-4aa9-8bb0-a1fc1cd4b13f"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Loading model file zh-models/7B/consolidated.00.pth\n",
+ "Loading vocab file zh-models/tokenizer.model\n",
+ "Writing vocab...\n",
+ "[ 1/291] Writing tensor tok_embeddings.weight | size 49954 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 2/291] Writing tensor norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+ "[ 3/291] Writing tensor output.weight | size 49954 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 4/291] Writing tensor layers.0.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 5/291] Writing tensor layers.0.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 6/291] Writing tensor layers.0.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 7/291] Writing tensor layers.0.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 8/291] Writing tensor layers.0.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+ "[ 9/291] Writing tensor layers.0.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 10/291] Writing tensor layers.0.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n",
+ "[ 11/291] Writing tensor layers.0.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 12/291] Writing tensor layers.0.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+ "[ 13/291] Writing tensor layers.1.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 14/291] Writing tensor layers.1.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 15/291] Writing tensor layers.1.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 16/291] Writing tensor layers.1.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 17/291] Writing tensor layers.1.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+ "[ 18/291] Writing tensor layers.1.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 19/291] Writing tensor layers.1.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n",
+ "[ 20/291] Writing tensor layers.1.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 21/291] Writing tensor layers.1.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+ "[ 22/291] Writing tensor layers.2.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 23/291] Writing tensor layers.2.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 24/291] Writing tensor layers.2.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 25/291] Writing tensor layers.2.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 26/291] Writing tensor layers.2.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+ "[ 27/291] Writing tensor layers.2.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 28/291] Writing tensor layers.2.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n",
+ "[ 29/291] Writing tensor layers.2.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 30/291] Writing tensor layers.2.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+ "[ 31/291] Writing tensor layers.3.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 32/291] Writing tensor layers.3.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 33/291] Writing tensor layers.3.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 34/291] Writing tensor layers.3.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 35/291] Writing tensor layers.3.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+ "[ 36/291] Writing tensor layers.3.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 37/291] Writing tensor layers.3.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n",
+ "[ 38/291] Writing tensor layers.3.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 39/291] Writing tensor layers.3.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+ "[ 40/291] Writing tensor layers.4.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 41/291] Writing tensor layers.4.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 42/291] Writing tensor layers.4.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 43/291] Writing tensor layers.4.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 44/291] Writing tensor layers.4.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+ "[ 45/291] Writing tensor layers.4.feed_forward.w1.weight | size 11008 x 4096 | 
type UnquantizedDataType(name='F16')\n",
+      "[ 46/291] Writing tensor layers.4.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n",
+      "[ 47/291] Writing tensor layers.4.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n",
+      "[ 48/291] Writing tensor layers.4.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+      "[... entries 49-290 omitted: layers.5 through layers.31 repeat the same pattern (wq/wk/wv/wo 4096 x 4096 F16; w1/w3 11008 x 4096 F16; w2 4096 x 11008 F16; attention_norm/ffn_norm 4096 F32) ...]\n",
+      "[291/291] Writing tensor layers.31.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+      "Wrote zh-models/7B/ggml-model-f16.bin\n"
+     ]
+    }
+   ],
+   "source": [
+    "!cd llama.cpp && python convert.py zh-models/7B/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "hEZEJAVYCHkc"
+   },
+   "source": [
+    "### Quantize the FP16 model to 4-bit\n",
+    "\n",
+    "We further convert the FP16 model into a 4-bit quantized model (q4_0, as the output filename in the log shows). A sketch of the corresponding quantize invocation is given below."
+   ]
+  },
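The quantize cell below only shows its log output. For readers following along outside the notebook, here is a minimal sketch of the invocation that produces it, assuming llama.cpp has been cloned and built so that the `quantize` binary sits next to `convert.py`; the trailing `2` is the ftype code that selected q4_0 in llama.cpp builds of this vintage (newer builds also accept the literal name `q4_0`):

```bash
# Minimal sketch: quantize the FP16 GGML model to 4-bit (q4_0).
# Assumes llama.cpp is built and zh-models/7B/ggml-model-f16.bin exists;
# paths match the log below, and ftype 2 == q4_0 in this era of llama.cpp.
cd llama.cpp
./quantize ./zh-models/7B/ggml-model-f16.bin ./zh-models/7B/ggml-model-q4_0.bin 2
```

q4_0 stores weights block-wise at 4 bits with a per-block scale, which is why each f16 tensor shrinks in the log (e.g. 32.00 MB -> 18.00 MB) while the small f32 norm vectors are copied through unquantized; the sixteen `hist` columns report the observed frequency of each possible 4-bit value.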
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "2xyais7OUVDI",
+    "outputId": "b7fe3c62-489a-42e5-927a-8ab6088a3ecc"
+   },
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "llama.cpp: loading model from ./zh-models/7B/ggml-model-f16.bin\n",
+      "llama.cpp: saving model to ./zh-models/7B/ggml-model-q4_0.bin\n",
+      "[ 1/ 291] tok_embeddings.weight - 4096 x 49954, type = f16, quantizing .. size = 390.27 MB -> 219.52 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n",
+      "[ 2/ 291] norm.weight - 4096, type = f32, size = 0.016 MB\n",
+      "[ 3/ 291] output.weight - 4096 x 49954, type = f16, quantizing .. size = 390.27 MB -> 219.52 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n",
+      "[ 4/ 291] layers.0.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.026 0.018 0.028 0.044 0.064 0.088 0.111 0.245 0.111 0.087 0.064 0.044 0.028 0.018 0.026 \n",
+      "[ 5/ 291] layers.0.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.026 0.017 0.028 0.043 0.063 0.087 0.111 0.250 0.112 0.087 0.063 0.043 0.028 0.017 0.026 \n",
+      "[ 6/ 291] layers.0.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.019 0.031 0.046 0.065 0.087 0.107 0.237 0.107 0.087 0.065 0.046 0.030 0.019 0.027 \n",
+      "[ 7/ 291] layers.0.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.026 0.017 0.027 0.042 0.062 0.087 0.113 0.253 0.113 0.087 0.062 0.042 0.027 0.017 0.026 \n",
+      "[ 8/ 291] layers.0.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n",
+      "[ 9/ 291] layers.0.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n",
+      "[ 10/ 291] layers.0.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.047 0.031 0.020 0.027 \n",
+      "[ 11/ 291] layers.0.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.047 0.067 0.088 0.106 0.227 0.106 0.088 0.067 0.047 0.032 0.020 0.027 \n",
+      "[ 12/ 291] layers.0.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n",
+      "[... entries 13-225 omitted: layers.1 through layers.23 and the start of layers.24 quantize with near-identical sizes and histograms ...]\n",
+      "[ 226/ 291] layers.24.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. 
size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 227/ 291] layers.24.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 228/ 291] layers.24.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 229/ 291] layers.25.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 230/ 291] layers.25.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 231/ 291] layers.25.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 232/ 291] layers.25.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 233/ 291] layers.25.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 234/ 291] layers.25.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.028 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.105 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 235/ 291] layers.25.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 236/ 291] layers.25.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 237/ 291] layers.25.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 238/ 291] layers.26.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 239/ 291] layers.26.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 240/ 291] layers.26.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.031 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 241/ 291] layers.26.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 242/ 291] layers.26.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 243/ 291] layers.26.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.068 0.088 0.106 0.224 0.105 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 244/ 291] layers.26.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. 
size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 245/ 291] layers.26.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 246/ 291] layers.26.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 247/ 291] layers.27.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 248/ 291] layers.27.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 249/ 291] layers.27.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 250/ 291] layers.27.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 251/ 291] layers.27.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 252/ 291] layers.27.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.028 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 253/ 291] layers.27.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 254/ 291] layers.27.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 255/ 291] layers.27.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 256/ 291] layers.28.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 257/ 291] layers.28.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 258/ 291] layers.28.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 259/ 291] layers.28.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 260/ 291] layers.28.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 261/ 291] layers.28.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.105 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 262/ 291] layers.28.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. 
size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.047 0.031 0.020 0.027 \n", + "[ 263/ 291] layers.28.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 264/ 291] layers.28.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 265/ 291] layers.29.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 266/ 291] layers.29.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 267/ 291] layers.29.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 268/ 291] layers.29.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 269/ 291] layers.29.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 270/ 291] layers.29.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 271/ 291] layers.29.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.227 0.107 0.088 0.067 0.047 0.031 0.020 0.027 \n", + "[ 272/ 291] layers.29.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 273/ 291] layers.29.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 274/ 291] layers.30.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.047 0.032 0.020 0.027 \n", + "[ 275/ 291] layers.30.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 276/ 291] layers.30.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.047 0.032 0.020 0.027 \n", + "[ 277/ 291] layers.30.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 278/ 291] layers.30.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 279/ 291] layers.30.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 280/ 291] layers.30.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. 
size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.019 0.030 0.046 0.066 0.088 0.108 0.232 0.108 0.088 0.066 0.046 0.031 0.019 0.027 \n", + "[ 281/ 291] layers.30.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 282/ 291] layers.30.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 283/ 291] layers.31.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.228 0.106 0.088 0.067 0.047 0.031 0.020 0.027 \n", + "[ 284/ 291] layers.31.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.047 0.031 0.020 0.027 \n", + "[ 285/ 291] layers.31.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.228 0.106 0.088 0.067 0.047 0.031 0.020 0.027 \n", + "[ 286/ 291] layers.31.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 287/ 291] layers.31.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 288/ 291] layers.31.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 289/ 291] layers.31.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.019 0.030 0.045 0.065 0.088 0.109 0.237 0.109 0.088 0.065 0.045 0.030 0.019 0.027 \n", + "[ 290/ 291] layers.31.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. 
size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n",
+ "[ 291/ 291] layers.31.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n",
+ "llama_model_quantize_internal: model size = 13133.55 MB\n",
+ "llama_model_quantize_internal: quant size = 7388.06 MB\n",
+ "llama_model_quantize_internal: hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n",
+ "\n",
+ "main: quantize time = 146381.23 ms\n",
+ "main: total time = 146381.23 ms\n"
+ ]
+ }
+ ],
+ "source": [
+ "!cd llama.cpp && ./quantize ./zh-models/7B/ggml-model-f16.bin ./zh-models/7B/ggml-model-q8_0.bin 7"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!sha256sum ./llama.cpp/zh-models/7B/ggml-model-q8_0.bin"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "2PR5jo2P-hOw",
+ "outputId": "2d808543-557d-4d0a-becb-ab35c4ccb8ff"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "0eec8927427f159397c79961a28d62d78849514a4a19033b247edd6ac3fc2cfd  ./llama.cpp/zh-models/7B/ggml-model-q8_0.bin\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "DLkuRAo9Vkb1"
+ },
+ "source": [
+ "### (Optional) Test decoding with the quantized model\n",
+ "All conversion steps are now complete.\n",
+ "We run a single command to check that the model loads correctly and can hold a conversation.\n",
+ "\n",
+ "The FP16 and Q8 quantized files are stored under ./llama.cpp/zh-models/7B and can be downloaded as needed."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "tW-ep1BsVQtG",
+ "outputId": "b3b28e5e-c731-4bb5-d3ae-c09d4c7bfb81"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "main: seed = 1682671021\n",
+ "llama.cpp: loading model from ./zh-models/7B/ggml-model-q8_0.bin\n",
+ "llama_model_load_internal: format = ggjt v1 (latest)\n",
+ "llama_model_load_internal: n_vocab = 49954\n",
+ "llama_model_load_internal: n_ctx = 512\n",
+ "llama_model_load_internal: n_embd = 4096\n",
+ "llama_model_load_internal: n_mult = 256\n",
+ "llama_model_load_internal: n_head = 32\n",
+ "llama_model_load_internal: n_layer = 32\n",
+ "llama_model_load_internal: n_rot = 128\n",
+ "llama_model_load_internal: ftype = 7 (mostly Q8_0)\n",
+ "llama_model_load_internal: n_ff = 11008\n",
+ "llama_model_load_internal: n_parts = 1\n",
+ "llama_model_load_internal: model size = 7B\n",
+ "llama_model_load_internal: ggml ctx size = 59.11 KB\n",
+ "llama_model_load_internal: mem required = 9180.12 MB (+ 1026.00 MB per state)\n",
+ "llama_init_from_file: kv self size = 256.00 MB\n",
+ "\n",
+ "system_info: n_threads = 4 / 4 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | \n",
+ "sampling: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.100000\n",
+ "generate: n_ctx = 512, n_batch = 512, n_predict = 512, n_keep = 0\n",
+ "\n",
+ "\n",
+ "\u001b[33m 详细介绍一下北京的名胜古迹:\u001b[0m长城、故宫等。同时介绍一些小众景点,比如颐和园中的石舫、圆明园中的琉璃花门等等。 [end of text]\n",
+ "\n",
+ "llama_print_timings: load time = 19881.66 ms\n",
+ "llama_print_timings: sample time = 48.31 ms / 32 runs ( 1.51 ms per run)\n",
+ "llama_print_timings: prompt eval time = 11365.17 ms / 11 tokens ( 1033.20 ms per token)\n",
+ "llama_print_timings: eval time = 33910.03 ms / 31 runs ( 1093.87 ms per run)\n",
+ "llama_print_timings: total time = 53841.09 ms\n"
+ ]
+ }
+ ],
+ "source": [
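+ "# Flag reference for llama.cpp's ./main: -m = path to the model file, --color = colorized output,\n",
+ "# -f = read an initial prompt from a file (here the Alpaca instruction template), -p = prompt text,\n",
+ "# -n = maximum number of tokens to predict.\n",
+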
"!cd llama.cpp && ./main -m ./zh-models/7B/ggml-model-q8_0.bin --color -f ./prompts/alpaca.txt -p \"详细介绍一下北京的名胜古迹:\" -n 512" + ] + } + ], + "metadata": { + "accelerator": "TPU", + "colab": { + "machine_shape": "hm", + "provenance": [] + }, + "gpuClass": "premium", + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/notebooks/legacy/convert_and_quantize_chinese_llama.ipynb b/notebooks/legacy/convert_and_quantize_chinese_llama.ipynb new file mode 100644 index 0000000..ce077f3 --- /dev/null +++ b/notebooks/legacy/convert_and_quantize_chinese_llama.ipynb @@ -0,0 +1,1874 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "machine_shape": "hm" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "gpuClass": "standard", + "accelerator": "TPU" + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# 转换并量化中文LLaMA/Alpaca模型\n", + "\n", + "🎉🎉🎉 **新:现在免费用户也有机会能够转换7B和13B模型了!**\n", + "\n", + "💡 提示和小窍门:\n", + "- 免费用户默认的内存只有12G左右,**笔者用免费账号实测选择TPU的话有机会随机出35G内存**,建议多试几次。如果能随机出25G内存以上的机器就可以了转换7B模型了,35G内存以上机器就能转换13B模型了\n", + "- Pro(+)用户请选择 “代码执行程序” -> “更改运行时类型” -> “高RAM”\n", + "- 实测:转换7B级别模型,25G内存的机器就够了;转换13B级别模型需要30G以上的内存(程序莫名崩掉或断开连接就说明内存爆了)\n", + "- 如果选了“高RAM”之后内存还是不够大的话,选择以下操作,有的时候会分配出很高内存的机器,祝你好运😄!\n", + " - 可以把GPU或者TPU也选上(虽然不会用到)\n", + " - 选GPU时,Pro用户可选“高级”类型GPU\n", + "\n", + "以下信息配置信息供参考(Pro订阅下测试),运行时规格设置为“高RAM”时的设备配置如下(有随机性):\n", + "\n", + "| 硬件加速器 | RAM | 硬盘 |\n", + "| :-- | :--: | :--: |\n", + "| None | 25GB | 225GB |\n", + "| TPU | 35GB | 225GB |\n", + "| GPU(标准,T4)| 25GB | 166GB |\n", + "| GPU(高性能,V100)| 25GB | 166GB |\n", + "| GPU(高性能,A100)| **80GB** | 166GB |\n", + "\n", + "*温馨提示:用完之后注意断开运行时,选择满足要求的最低配置即可,避免不必要的计算单元消耗(Pro只给100个计算单元)。*" + ], + "metadata": { + "id": "B1c96_k3MahN" + } + }, + { + "cell_type": "markdown", + "source": [ + "## 安装相关依赖" + ], + "metadata": { + "id": "vScqHD_jMFOV" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "E5WKFJXIL6ZU", + "outputId": "7ce317e5-c105-49a8-d1af-70c29e6246e1" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting transformers\n", + " Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.0/7.0 MB\u001b[0m \u001b[31m54.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from transformers) (2.27.1)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.9/dist-packages (from transformers) (1.24.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.9/dist-packages (from transformers) (6.0)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.9/dist-packages (from transformers) (4.65.0)\n", + "Collecting huggingface-hub<1.0,>=0.11.0\n", + " Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m200.1/200.1 kB\u001b[0m \u001b[31m24.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + 
"\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.9/dist-packages (from transformers) (2022.10.31)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.9/dist-packages (from transformers) (3.11.0)\n", + "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1\n", + " Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m97.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.9/dist-packages (from transformers) (23.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.9/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (4.5.0)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (2.0.12)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (2022.12.7)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (1.26.15)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (3.4)\n", + "Installing collected packages: tokenizers, huggingface-hub, transformers\n", + "Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting peft\n", + " Downloading peft-0.2.0-py3-none-any.whl (40 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.3/40.3 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: psutil in /usr/local/lib/python3.9/dist-packages (from peft) (5.9.4)\n", + "Requirement already satisfied: transformers in /usr/local/lib/python3.9/dist-packages (from peft) (4.28.0)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.9/dist-packages (from peft) (6.0)\n", + "Requirement already satisfied: torch>=1.13.0 in /usr/local/lib/python3.9/dist-packages (from peft) (2.0.0+cu118)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.9/dist-packages (from peft) (23.0)\n", + "Collecting accelerate\n", + " Downloading accelerate-0.18.0-py3-none-any.whl (215 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m215.3/215.3 kB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.9/dist-packages (from peft) (1.24.2)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.9/dist-packages (from torch>=1.13.0->peft) (1.11.1)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.9/dist-packages (from torch>=1.13.0->peft) (3.1.2)\n", + "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.9/dist-packages (from torch>=1.13.0->peft) (2.0.0)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.9/dist-packages (from torch>=1.13.0->peft) (3.1)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.9/dist-packages (from torch>=1.13.0->peft) (3.11.0)\n", + "Requirement 
already satisfied: typing-extensions in /usr/local/lib/python3.9/dist-packages (from torch>=1.13.0->peft) (4.5.0)\n",
+ "Requirement already satisfied: lit in /usr/local/lib/python3.9/dist-packages (from triton==2.0.0->torch>=1.13.0->peft) (16.0.1)\n",
+ "Requirement already satisfied: cmake in /usr/local/lib/python3.9/dist-packages (from triton==2.0.0->torch>=1.13.0->peft) (3.25.2)\n",
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.11.0 in /usr/local/lib/python3.9/dist-packages (from transformers->peft) (0.13.4)\n",
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.9/dist-packages (from transformers->peft) (2022.10.31)\n",
+ "Requirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from transformers->peft) (2.27.1)\n",
+ "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.9/dist-packages (from transformers->peft) (0.13.3)\n",
+ "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.9/dist-packages (from transformers->peft) (4.65.0)\n",
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.9/dist-packages (from jinja2->torch>=1.13.0->peft) (2.1.2)\n",
+ "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.9/dist-packages (from requests->transformers->peft) (2.0.12)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests->transformers->peft) (3.4)\n",
+ "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests->transformers->peft) (1.26.15)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests->transformers->peft) (2022.12.7)\n",
+ "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.9/dist-packages (from sympy->torch>=1.13.0->peft) (1.3.0)\n",
+ "Installing collected packages: accelerate, peft\n",
+ "Successfully installed accelerate-0.18.0 peft-0.2.0\n",
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Collecting sentencepiece\n",
+ "  Downloading sentencepiece-0.1.98-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
+ "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m18.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hInstalling collected packages: sentencepiece\n",
+ "Successfully installed sentencepiece-0.1.98\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pip install transformers\n",
+ "!pip install peft\n",
+ "!pip install sentencepiece"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Clone the repositories"
+ ],
+ "metadata": {
+ "id": "ygb1xFIMNQKw"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!git clone https://github.com/ymcui/Chinese-LLaMA-Alpaca\n",
+ "!git clone https://github.com/ggerganov/llama.cpp"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "yCEJh7NJNXz9",
+ "outputId": "91a0e4ff-af63-4f8e-ab82-ee4ddf583033"
+ },
+ "execution_count": 2,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Cloning into 'Chinese-LLaMA-Alpaca'...\n",
+ "remote: Enumerating objects: 559, done.\u001b[K\n",
+ "remote: Counting objects: 100% (129/129), done.\u001b[K\n",
+ "remote: Compressing objects: 100% (115/115), done.\u001b[K\n",
+ "remote: Total 559 (delta 30), reused 22 (delta 14), pack-reused 430\u001b[K\n",
+ "Receiving objects: 100% (559/559), 10.71 MiB | 25.49 MiB/s, done.\n",
+ "Resolving deltas: 100% (333/333), done.\n",
+ "Cloning into 'llama.cpp'...\n",
+ "remote: Enumerating objects: 1701, done.\u001b[K\n",
+ "remote: Counting objects: 100% (1701/1701), done.\u001b[K\n",
+ "remote: Compressing objects: 100% (620/620), done.\u001b[K\n",
+ "remote: Total 1701 (delta 1084), reused 1623 (delta 1047), pack-reused 0\u001b[K\n",
+ "Receiving objects: 100% (1701/1701), 1.86 MiB | 14.74 MiB/s, done.\n",
+ "Resolving deltas: 100% (1084/1084), done.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Merge the models (Alpaca-7B as an example)\n",
+ "\n",
+ "**⚠️ One more reminder: the 7B model needs 25GB of RAM, and the 13B model needs 35GB+.**\n",
+ "\n",
+ "We use the base model provided on the 🤗 model hub (already in HF format) rather than Facebook's official LLaMA release, so the step of converting the original LLaMA weights to HF format is skipped.\n",
+ "\n",
+ "**Here we run step two directly: merging the LoRA weights** to produce the full model weights. The arguments can point either at a 🤗 model hub repo or at a local path.\n",
+ "- Base model: `decapoda-research/llama-7b-hf` *(use at your own risk)*\n",
+ "- LoRA model: `ziqingyang/chinese-alpaca-lora-7b`\n",
+ "\n",
+ "💡 Tips for converting the 13B model:\n",
+ "- Simply change `7b` to `13b` in the `--base_model` and `--lora_model` arguments\n",
+ "- **Free-tier users must add the `--offload_dir` argument to relieve memory pressure**, e.g. `--offload_dir ./offload_temp`\n",
+ "\n",
+ "This step is fairly time-consuming (download + conversion) and takes anywhere from a few minutes to over ten minutes; please be patient.\n",
+ "The converted model is stored in the `alpaca-combined` directory.\n",
+ "If you do not need a quantized model, you are done at this point."
+ ],
+ "metadata": {
+ "id": "nIyxX0DSNsgQ"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!python ./Chinese-LLaMA-Alpaca/scripts/merge_llama_with_chinese_lora.py \\\n",
+ "    --base_model 'decapoda-research/llama-7b-hf' \\\n",
+ "    --lora_model 'ziqingyang/chinese-alpaca-lora-7b' \\\n",
+ "    --output_dir alpaca-combined"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "5AV4EW5hNhVV",
+ "outputId": "e34419d4-b7c9-4e22-af37-abf80d4163ba"
+ },
+ "execution_count": 3,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "2023-04-14 10:13:45.382526: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
+ "Downloading tokenizer.model: 100% 758k/758k [00:00<00:00, 12.7MB/s]\n",
+ "Downloading (…)cial_tokens_map.json: 100% 96.0/96.0 [00:00<00:00, 15.3kB/s]\n",
+ "Downloading (…)okenizer_config.json: 100% 166/166 [00:00<00:00, 63.2kB/s]\n",
+ "Downloading (…)lve/main/config.json: 100% 427/427 [00:00<00:00, 63.4kB/s]\n",
+ "Downloading (…)model.bin.index.json: 100% 25.5k/25.5k [00:00<00:00, 9.41MB/s]\n",
+ "Downloading shards: 0% 0/33 [00:00 121.96 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n",
+ "[2/291] norm.weight - [4096], type = f32, size = 0.016 MB\n",
+ "[3/291] output.weight - [4096 x 49954], type = f16, quantizing .. size = 390.27 MB -> 121.96 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.126 0.134 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n",
+ "[4/291] layers.0.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.021 0.016 0.028 0.046 0.071 0.103 0.137 0.158 0.137 0.103 0.071 0.046 0.028 0.016 0.021 \n",
+ "[5/291] layers.0.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.021 0.016 0.027 0.045 0.071 0.104 0.138 0.158 0.139 0.104 0.071 0.045 0.027 0.016 0.021 \n",
+ "[6/291] layers.0.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.018 0.032 0.051 0.076 0.103 0.128 0.141 0.128 0.103 0.075 0.051 0.032 0.019 0.022 \n",
+ "[7/291] layers.0.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.021 0.016 0.028 0.046 0.072 0.105 0.136 0.151 0.136 0.105 0.072 0.046 0.028 0.016 0.021 \n", + "[8/291] layers.0.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[9/291] layers.0.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[10/291] layers.0.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.126 0.134 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[11/291] layers.0.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.126 0.134 0.125 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[12/291] layers.0.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[13/291] layers.1.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.032 0.051 0.077 0.104 0.127 0.137 0.127 0.104 0.077 0.051 0.032 0.019 0.022 \n", + "[14/291] layers.1.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.021 0.018 0.032 0.051 0.076 0.104 0.128 0.138 0.128 0.104 0.077 0.051 0.032 0.018 0.022 \n", + "[15/291] layers.1.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.018 0.031 0.051 0.076 0.104 0.129 0.139 0.129 0.104 0.076 0.051 0.031 0.018 0.021 \n", + "[16/291] layers.1.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.021 0.016 0.028 0.046 0.071 0.104 0.137 0.154 0.137 0.104 0.071 0.046 0.028 0.016 0.021 \n", + "[17/291] layers.1.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[18/291] layers.1.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[19/291] layers.1.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[20/291] layers.1.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[21/291] layers.1.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[22/291] layers.2.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.032 0.052 0.077 0.104 0.126 0.135 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[23/291] layers.2.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.032 0.051 0.076 0.104 0.127 0.137 0.127 0.104 0.077 0.051 0.032 0.019 0.022 \n", + "[24/291] layers.2.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.125 0.136 0.125 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[25/291] layers.2.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.032 0.052 0.077 0.104 0.126 0.135 0.126 0.104 0.077 0.052 0.032 0.019 0.022 \n", + "[26/291] layers.2.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[27/291] layers.2.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[28/291] layers.2.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[29/291] layers.2.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[30/291] layers.2.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[31/291] layers.3.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.126 0.135 0.126 0.104 0.077 0.052 0.032 0.019 0.022 \n", + "[32/291] layers.3.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.032 0.052 0.077 0.104 0.126 0.136 0.126 0.104 0.077 0.052 0.032 0.019 0.022 \n", + "[33/291] layers.3.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.125 0.135 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[34/291] layers.3.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[35/291] layers.3.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[36/291] layers.3.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[37/291] layers.3.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[38/291] layers.3.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[39/291] layers.3.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[40/291] layers.4.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.126 0.135 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[41/291] layers.4.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.032 0.052 0.077 0.104 0.126 0.135 0.126 0.104 0.077 0.052 0.032 0.019 0.022 \n", + "[42/291] layers.4.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.135 0.125 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[43/291] layers.4.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.078 0.104 0.125 0.134 0.125 0.104 0.077 0.053 0.033 0.019 0.022 \n", + "[44/291] layers.4.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[45/291] layers.4.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[46/291] layers.4.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[47/291] layers.4.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[48/291] layers.4.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[49/291] layers.5.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[50/291] layers.5.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.126 0.134 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[51/291] layers.5.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.125 0.135 0.125 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[52/291] layers.5.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[53/291] layers.5.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[54/291] layers.5.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[55/291] layers.5.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[56/291] layers.5.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[57/291] layers.5.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[58/291] layers.6.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[59/291] layers.6.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.125 0.134 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[60/291] layers.6.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.126 0.134 0.125 0.104 0.077 0.053 0.033 0.019 0.022 \n", + "[61/291] layers.6.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[62/291] layers.6.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[63/291] layers.6.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[64/291] layers.6.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[65/291] layers.6.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[66/291] layers.6.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[67/291] layers.7.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[68/291] layers.7.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.126 0.134 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[69/291] layers.7.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.125 0.135 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[70/291] layers.7.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[71/291] layers.7.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[72/291] layers.7.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[73/291] layers.7.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[74/291] layers.7.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[75/291] layers.7.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[76/291] layers.8.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[77/291] layers.8.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.052 0.033 0.019 0.022 \n", + "[78/291] layers.8.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.125 0.134 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[79/291] layers.8.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[80/291] layers.8.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "(... entries [81/291] through [289/291] omitted: the same per-layer pattern repeats for layers 8 to 31, with each [4096 x 4096] attention weight quantized from 32.00 MB to 10.00 MB, each [4096 x 11008] feed-forward weight from 86.00 MB to 26.88 MB, and each [4096] norm weight kept in f32 at 0.016 MB, all with near-identical histograms ...)\n", + "[290/291] layers.31.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. 
size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[291/291] layers.31.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "llama_model_quantize_internal: model size = 13133.55 MB\n", + "llama_model_quantize_internal: quant size = 4104.93 MB\n", + "llama_model_quantize_internal: hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "\n", + "main: quantize time = 178732.41 ms\n", + "main: total time = 178732.41 ms\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### (Optional) Test decoding with the quantized model\n", + "All conversion steps are now complete.\n", + "We run a single command to test whether the model loads correctly and can hold a conversation.\n", + "\n", + "The FP16 and Q4 quantized files are stored under ./llama.cpp/zh-models/7B and can be downloaded and used as needed." + ], + "metadata": { + "id": "DLkuRAo9Vkb1" + } + }, + { + "cell_type": "code", + "source": [ + "!cd llama.cpp && ./main -m ./zh-models/7B/ggml-model-q4_0.bin --color -f ./prompts/alpaca.txt -p \"详细介绍一下北京的名胜古迹:\" -n 512" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "tW-ep1BsVQtG", + "outputId": "0706c974-127e-4f21-be6b-d71ea4fb989b" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "main: seed = 1681467955\n", + "llama.cpp: loading model from ./zh-models/7B/ggml-model-q4_0.bin\n", + "llama_model_load_internal: format = ggjt v1 (latest)\n", + "llama_model_load_internal: n_vocab = 49954\n", + "llama_model_load_internal: n_ctx = 512\n", + "llama_model_load_internal: n_embd = 4096\n", + "llama_model_load_internal: n_mult = 256\n", + "llama_model_load_internal: n_head = 32\n", + "llama_model_load_internal: n_layer = 32\n", + "llama_model_load_internal: n_rot = 128\n", + "llama_model_load_internal: ftype = 2 (mostly Q4_0)\n", + "llama_model_load_internal: n_ff = 11008\n", + "llama_model_load_internal: n_parts = 1\n", + "llama_model_load_internal: model size = 7B\n", + "llama_model_load_internal: ggml ctx size = 59.11 KB\n", + "llama_model_load_internal: mem required = 5896.99 MB (+ 1026.00 MB per state)\n", + "llama_init_from_file: kv self size = 256.00 MB\n", + "\n", + "system_info: n_threads = 40 / 40 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | \n", + "sampling: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.100000\n", + "generate: n_ctx = 512, n_batch = 8, n_predict = 512, n_keep = 0\n", + "\n", + "\n", + "\u001b[33m 详细介绍一下北京的名胜古迹:\u001b[0m\n", + " 故宫:明、清两代皇室,御花园及八达门大街。 宫殿内有大量文物珍品; [end of text]\n", + "\n", + "llama_print_timings: load time = 717.01 ms\n", + "llama_print_timings: sample time = 48.97 ms / 32 runs ( 1.53 ms per run)\n", + "llama_print_timings: prompt eval time = 680.93 ms / 11 tokens ( 61.90 ms per token)\n", + "llama_print_timings: eval time = 4490.00 ms / 31 runs ( 144.84 ms per run)\n", + "llama_print_timings: total time = 5461.05 ms\n" + ] + } + ] + } + ] +} \ No newline at end of file From a5ef187b9d3610245f8e2992dffa91bc6807775f Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Fri, 16 Jun 2023 00:06:10 +0800 Subject: [PATCH 4/9] add assertions --- .../merge_llama_with_chinese_lora_low_mem.py | 52 +++++++++++-------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/scripts/merge_llama_with_chinese_lora_low_mem.py b/scripts/merge_llama_with_chinese_lora_low_mem.py index a770a2c..2291f79 100644 --- a/scripts/merge_llama_with_chinese_lora_low_mem.py +++ 
b/scripts/merge_llama_with_chinese_lora_low_mem.py @@ -2,7 +2,7 @@ Usage: python merge_llama_with_chinese_lora_low_mem.py \ --base_model path/to/llama/model \ - --lora_model path/to/first/lora/model[,path/to/second/lora/model] \ + --lora_model path/to/first/lora[,path/to/second/lora] \ --output_type [pth|huggingface] \ --output_dir path/to/output/dir """ @@ -12,19 +12,20 @@ import gc import torch import peft -from transformers import LlamaConfig, LlamaTokenizer +from transformers import LlamaTokenizer from transformers.modeling_utils import dtype_byte_size from huggingface_hub import snapshot_download import re parser = argparse.ArgumentParser() parser.add_argument('--base_model', default=None, required=True, - type=str, help="Please specify a base_model") + type=str, help="Please specify a base model.") parser.add_argument('--lora_model', default=None, required=True, - type=str, help="Please specify LoRA models to be merged (ordered); use commas to separate multiple LoRA models.") + type=str, help="Please specify LoRA models to be merged (ordered); use commas to separate multiple LoRA models") parser.add_argument('--output_type', default='pth',choices=['pth','huggingface'], type=str, - help="save the merged model in pth or huggingface format.") -parser.add_argument('--output_dir', default='./', type=str) + help="Save the merged model in pth or huggingface format") +parser.add_argument('--output_dir', default='./merged_model', type=str) +parser.add_argument('--verbose', default=False, action='store_true', help="Show detailed messages") emb_to_model_size = { @@ -121,7 +122,7 @@ def unpermute(w): ) -def save_shards(model_sd, num_shards: int, prefix=""): +def save_shards(model_sd, num_shards: int, prefix="", verbose=False): # Add the no_grad context manager with torch.no_grad(): if num_shards == 1: @@ -144,11 +145,9 @@ def save_shards(model_sd, num_shards: int, prefix=""): new_k = translate_state_dict_key(k) if new_k is not None: if new_k=='tok_embeddings.weight': - print(f"Processing {new_k}") assert v.size(1)%num_shards==0 splits = v.split(v.size(1)//num_shards,dim=1) elif new_k=='output.weight': - print(f"Processing {new_k}") if v.size(0)%num_shards==0: splits = v.split(v.size(0)//num_shards,dim=0) else: @@ -156,42 +155,35 @@ def save_shards(model_sd, num_shards: int, prefix=""): size_list[-1] += v.size(0)%num_shards splits = v.split(size_list, dim=0) # 13B: size_list == [24976,24977] elif new_k=='norm.weight': - print(f"Processing {new_k}") splits = [v] * num_shards elif 'ffn_norm.weight' in new_k: - print(f"Processing {new_k}") splits = [v] * num_shards elif 'attention_norm.weight' in new_k: - print(f"Processing {new_k}") splits = [v] * num_shards elif 'w1.weight' in new_k: - print(f"Processing {new_k}") splits = v.split(v.size(0)//num_shards,dim=0) elif 'w2.weight' in new_k: - print(f"Processing {new_k}") splits = v.split(v.size(1)//num_shards,dim=1) elif 'w3.weight' in new_k: - print(f"Processing {new_k}") splits = v.split(v.size(0)//num_shards,dim=0) elif 'wo.weight' in new_k: - print(f"Processing {new_k}") splits = v.split(v.size(1)//num_shards,dim=1) elif 'wv.weight' in new_k: - print(f"Processing {new_k}") splits = v.split(v.size(0)//num_shards,dim=0) elif "wq.weight" in new_k or "wk.weight" in new_k: - print(f"Processing {new_k}") v = unpermute(v) splits = v.split(v.size(0)//num_shards,dim=0) else: print(f"Unexpected key {new_k}") raise ValueError + if verbose: + print(f"Processing {new_k}") for sd,split in zip(new_state_dicts,splits): sd[new_k] = split.clone() del split @@ -248,6 
+240,12 @@ def merge_shards(output_dir, num_shards: int): tokenizer = LlamaTokenizer.from_pretrained(lora_model_path) lora_config = peft.LoraConfig.from_pretrained(lora_model_path) lora_state_dict = torch.load(os.path.join(lora_model_path,'adapter_model.bin'),map_location='cpu') + if 'base_model.model.model.embed_tokens.weight' in lora_state_dict: + lora_vocab_size = lora_state_dict['base_model.model.model.embed_tokens.weight'].shape[0] + assert lora_vocab_size==len(tokenizer), \ + (f"The vocab size of the tokenizer {len(tokenizer)} does not match the vocab size of the LoRA weight {lora_vocab_size}.\n" + "Did you misuse the LLaMA tokenizer with the Alpaca-LoRA weight?\n" + "Make sure that you use LLaMA tokenizer with the LLaMA-LoRA weight and Alpaca tokenizer with the Alpaca-LoRA weight!") tokenizers_and_loras.append( { "tokenizer" :tokenizer, @@ -256,6 +254,13 @@ def merge_shards(output_dir, num_shards: int): "scaling": lora_config.lora_alpha / lora_config.r, "fan_in_fan_out" : lora_config.fan_in_fan_out, }) + if len(tokenizers_and_loras)==2: + t1_vocab_size = len(tokenizers_and_loras[0]["tokenizer"]) + t2_vocab_size = len(tokenizers_and_loras[1]["tokenizer"]) + assert t1_vocab_size<=t2_vocab_size, \ + (f"The vocab size of the first tokenizer is {t1_vocab_size}\n" + f"The vocab size of the second tokenizer is {t2_vocab_size}, which is smaller than {t1_vocab_size}\n" + "This is not the intended use. Please check your model and tokenizer.") if not os.path.exists(base_model_path): print("Cannot find lora model on the disk. Downloading lora model from hub...") @@ -282,16 +287,19 @@ def merge_shards(output_dir, num_shards: int): dims_per_head = dim // n_heads base = 10000.0 inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) + print("Merging...") for k in state_dict: for ti, tandl in enumerate(tokenizers_and_loras): saved_key = 'base_model.model.'+k lora_key_A = saved_key.replace('.weight','.lora_A.weight') if saved_key in tandl['state_dict']: - print(f"copying {saved_key} from {ti}-th LoRA weight to {k}") + if args.verbose: + print(f"copying {saved_key} from {ti}-th LoRA weight to {k}") state_dict[k] = tandl['state_dict'][saved_key].half().clone() # do we need half()? if lora_key_A in tandl['state_dict']: lora_key_B = lora_key_A.replace('lora_A.weight','lora_B.weight') - print(f"merging {lora_key_A} and lora_B.weight form {ti}-th LoRA weight to {k}") + if args.verbose: + print(f"merging {lora_key_A} and lora_B.weight from {ti}-th LoRA weight to {k}") state_dict[k] += ( transpose( tandl['state_dict'][lora_key_B].float() @ tandl['state_dict'][lora_key_A].float(), tandl['fan_in_fan_out']) * tandl['scaling'] ) @@ -308,8 +316,8 @@ def merge_shards(output_dir, num_shards: int): print(f"Saving ckpt {filename} to {output_dir} in HF format...") torch.save(state_dict,os.path.join(output_dir, filename)) elif output_type=='pth': - print(f"Saving ckpt {filename} to {output_dir} in pth format...") - save_shards(model_sd=state_dict, num_shards=num_shards,prefix=f"L{index+1}-") + print(f"Converting to pth format...") + save_shards(model_sd=state_dict, num_shards=num_shards,prefix=f"L{index+1}-", verbose=args.verbose) del state_dict gc.collect() # Effectively enforce garbage collection
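Note: the assertions added in PATCH 4/9 above guard against pairing a LoRA checkpoint with the wrong tokenizer before any weights are merged. A minimal standalone sketch of the same check, assuming a hypothetical adapter path (none of the names below are shipped by the repo):

    import torch
    from transformers import LlamaTokenizer

    lora_model_path = "path/to/lora"  # hypothetical placeholder path
    tokenizer = LlamaTokenizer.from_pretrained(lora_model_path)
    lora_sd = torch.load(f"{lora_model_path}/adapter_model.bin", map_location="cpu")
    emb_key = "base_model.model.model.embed_tokens.weight"
    if emb_key in lora_sd:
        # A LoRA that resizes the embedding must carry one row per tokenizer token;
        # otherwise the merged model and its tokenizer disagree on the vocab size.
        assert lora_sd[emb_key].shape[0] == len(tokenizer), (
            f"tokenizer has {len(tokenizer)} tokens, "
            f"LoRA embedding has {lora_sd[emb_key].shape[0]} rows"
        )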
From 8c7322c7779450e5eaab9afc2e183d9b91d06c5b Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Fri, 16 Jun 2023 00:11:39 +0800 Subject: [PATCH 5/9] remove comments --- scripts/merge_llama_with_chinese_lora_low_mem.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scripts/merge_llama_with_chinese_lora_low_mem.py b/scripts/merge_llama_with_chinese_lora_low_mem.py index 2291f79..be48a01 100644 --- a/scripts/merge_llama_with_chinese_lora_low_mem.py +++ b/scripts/merge_llama_with_chinese_lora_low_mem.py @@ -308,10 +308,6 @@ def merge_shards(output_dir, num_shards: int): weight_size = state_dict[k].numel() * dtype_byte_size(state_dict[k].dtype) total_size += weight_size - # did we do anything? - # assert not torch.allclose(first_weight_old, first_weight) - # first_weight = base_model.model.layers[0].self_attn.q_proj.weight - # first_weight_old = first_weight.clone() if output_type=='huggingface': print(f"Saving ckpt {filename} to {output_dir} in HF format...") torch.save(state_dict,os.path.join(output_dir, filename)) From 2ee6100a624eb647b3a7cbbea73de48d3b2257c3 Mon Sep 17 00:00:00 2001 From: yaoxin <35353688+iMountTai@users.noreply.github.com> Date: Fri, 16 Jun 2023 10:10:02 +0800 Subject: [PATCH 6/9] Update merge_llama_with_chinese_lora_low_mem.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The error message can easily cause confusion, so it is removed. --- scripts/merge_llama_with_chinese_lora_low_mem.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/merge_llama_with_chinese_lora_low_mem.py b/scripts/merge_llama_with_chinese_lora_low_mem.py index be48a01..e6ab3aa 100644 --- a/scripts/merge_llama_with_chinese_lora_low_mem.py +++ b/scripts/merge_llama_with_chinese_lora_low_mem.py @@ -244,7 +244,6 @@ def merge_shards(output_dir, num_shards: int): lora_vocab_size = lora_state_dict['base_model.model.model.embed_tokens.weight'].shape[0] assert lora_vocab_size==len(tokenizer), \ (f"The vocab size of the tokenizer {len(tokenizer)} does not match the vocab size of the LoRA weight {lora_vocab_size}.\n" - "Did you misuse the LLaMA tokenizer with the Alpaca-LoRA weight?\n" "Make sure that you use LLaMA tokenizer with the LLaMA-LoRA weight and Alpaca tokenizer with the Alpaca-LoRA weight!") tokenizers_and_loras.append( { @@ -335,4 +334,4 @@ def merge_shards(output_dir, num_shards: int): if config=='pytorch_model.bin.index.json': obj['metadata']['total_size'] = total_size json.dump(obj, open(os.path.join(output_dir, config),'w'), indent=2) - print("Done.") \ No newline at end of file + print("Done.")
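Note: the merge loop that PATCH 4/9 gated behind --verbose and that PATCH 7/9 below renames applies the standard LoRA update, W_merged = W + scaling * transpose(lora_B @ lora_A), with scaling = lora_alpha / r. A minimal sketch of that arithmetic on illustrative shapes (the helper name, shapes, and alpha value are assumptions for illustration, not the script's API):

    import torch

    def merge_lora(W, lora_A, lora_B, scaling, fan_in_fan_out=False):
        # delta_W = B @ A, transposed for fan_in_fan_out (Conv1D-style) layers
        delta = lora_B.float() @ lora_A.float()
        if fan_in_fan_out:
            delta = delta.T
        return (W.float() + scaling * delta).half()

    r, d = 8, 4096                  # illustrative: rank-8 LoRA on a 4096x4096 projection
    W = torch.zeros(d, d, dtype=torch.float16)
    A = torch.randn(r, d)           # lora_A.weight: (r, in_features)
    B = torch.randn(d, r)           # lora_B.weight: (out_features, r)
    merged = merge_lora(W, A, B, scaling=32 / r)  # scaling = lora_alpha / r; alpha=32 assumed
    assert merged.shape == W.shape and merged.dtype == torch.float16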
From d2086171d3823b53b8f0948ab7e256ddd9b180c9 Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Fri, 16 Jun 2023 10:58:08 +0800 Subject: [PATCH 7/9] improve naming --- scripts/merge_llama_with_chinese_lora_low_mem.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/scripts/merge_llama_with_chinese_lora_low_mem.py b/scripts/merge_llama_with_chinese_lora_low_mem.py index e6ab3aa..166968b 100644 --- a/scripts/merge_llama_with_chinese_lora_low_mem.py +++ b/scripts/merge_llama_with_chinese_lora_low_mem.py @@ -123,7 +123,9 @@ def unpermute(w): def save_shards(model_sd, num_shards: int, prefix="", verbose=False): - # Add the no_grad context manager + """ + Convert and save the HF format weights to PTH format weights + """ with torch.no_grad(): if num_shards == 1: new_state_dict = {} @@ -288,17 +290,17 @@ def merge_shards(output_dir, num_shards: int): inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) print("Merging...") for k in state_dict: - for ti, tandl in enumerate(tokenizers_and_loras): + for tl_idx, tandl in enumerate(tokenizers_and_loras): saved_key = 'base_model.model.'+k lora_key_A = saved_key.replace('.weight','.lora_A.weight') if saved_key in tandl['state_dict']: if args.verbose: - print(f"copying {saved_key} from {ti}-th LoRA weight to {k}") + print(f"copying {saved_key} from {tl_idx}-th LoRA weight to {k}") state_dict[k] = tandl['state_dict'][saved_key].half().clone() # do we need half()? if lora_key_A in tandl['state_dict']: lora_key_B = lora_key_A.replace('lora_A.weight','lora_B.weight') if args.verbose: - print(f"merging {lora_key_A} and lora_B.weight from {ti}-th LoRA weight to {k}") + print(f"merging {lora_key_A} and lora_B.weight from {tl_idx}-th LoRA weight to {k}") state_dict[k] += ( transpose( tandl['state_dict'][lora_key_B].float() @@ -330,8 +332,10 @@ def merge_shards(output_dir, num_shards: int): for config in configs: if os.path.exists(os.path.join(base_model_path, config)): print(f"Saving {config}") - obj = json.load(open(os.path.join(base_model_path, config))) + with open(os.path.join(base_model_path, config),'r') as f: + obj = json.load(f) if config=='pytorch_model.bin.index.json': obj['metadata']['total_size'] = total_size - json.dump(obj, open(os.path.join(output_dir, config),'w'), indent=2) + with open(os.path.join(output_dir, config), 'w') as f: + json.dump(obj, f, indent=2) print("Done.")
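Note: the config rewrite at the end of PATCH 7/9 above keeps pytorch_model.bin.index.json consistent with the merged weights: its metadata total_size must equal the byte count of all merged tensors, which the script accumulates with dtype_byte_size. A sketch of that accounting (shard_bytes is a hypothetical helper name):

    import torch
    from transformers.modeling_utils import dtype_byte_size

    def shard_bytes(state_dict):
        # numel * bytes-per-element, summed over the shard's tensors,
        # mirroring how the script accumulates total_size for the index file
        return sum(t.numel() * dtype_byte_size(t.dtype) for t in state_dict.values())

    demo = {"w": torch.zeros(2, 3, dtype=torch.float16)}
    assert shard_bytes(demo) == 12  # 6 elements x 2 bytes each for fp16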
From c43f34b769d43698efccd5e2b398c2353fc51b81 Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Fri, 16 Jun 2023 11:03:10 +0800 Subject: [PATCH 8/9] update help info --- .../merge_llama_with_chinese_lora_low_mem.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/scripts/merge_llama_with_chinese_lora_low_mem.py b/scripts/merge_llama_with_chinese_lora_low_mem.py index 166968b..87fbf71 100644 --- a/scripts/merge_llama_with_chinese_lora_low_mem.py +++ b/scripts/merge_llama_with_chinese_lora_low_mem.py @@ -19,13 +19,15 @@ parser = argparse.ArgumentParser() parser.add_argument('--base_model', default=None, required=True, - type=str, help="Please specify a base model.") + type=str, help="Please specify a base model") parser.add_argument('--lora_model', default=None, required=True, type=str, help="Please specify LoRA models to be merged (ordered); use commas to separate multiple LoRA models") -parser.add_argument('--output_type', default='pth',choices=['pth','huggingface'], type=str, - help="Save the merged model in pth or huggingface format") -parser.add_argument('--output_dir', default='./merged_model', type=str) -parser.add_argument('--verbose', default=False, action='store_true', help="Show detailed messages") +parser.add_argument('--output_type', default='pth',choices=['pth','huggingface'], + type=str, help="Save the merged model in pth or huggingface format") +parser.add_argument('--output_dir', default='./merged_model', + type=str, help="The output folder where we save the merged mdoel") +parser.add_argument('--verbose', default=False, action='store_true', + help="Show detailed messages") emb_to_model_size = { @@ -290,21 +292,21 @@ def merge_shards(output_dir, num_shards: int): inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) print("Merging...") for k in state_dict: - for tl_idx, tandl in enumerate(tokenizers_and_loras): + for tl_idx, t_and_l in enumerate(tokenizers_and_loras): saved_key = 'base_model.model.'+k lora_key_A = saved_key.replace('.weight','.lora_A.weight') - if saved_key in tandl['state_dict']: + if saved_key in t_and_l['state_dict']: if args.verbose: print(f"copying {saved_key} from {tl_idx}-th LoRA weight to {k}") - state_dict[k] = tandl['state_dict'][saved_key].half().clone() # do we need half()? - if lora_key_A in tandl['state_dict']: + state_dict[k] = t_and_l['state_dict'][saved_key].half().clone() # do we need half()? + if lora_key_A in t_and_l['state_dict']: lora_key_B = lora_key_A.replace('lora_A.weight','lora_B.weight') if args.verbose: print(f"merging {lora_key_A} and lora_B.weight from {tl_idx}-th LoRA weight to {k}") state_dict[k] += ( transpose( - tandl['state_dict'][lora_key_B].float() - @ tandl['state_dict'][lora_key_A].float(), tandl['fan_in_fan_out']) * tandl['scaling'] + t_and_l['state_dict'][lora_key_B].float() + @ t_and_l['state_dict'][lora_key_A].float(), t_and_l['fan_in_fan_out']) * t_and_l['scaling'] ) weight_size = state_dict[k].numel() * dtype_byte_size(state_dict[k].dtype) total_size += weight_size From 25dba18cae809ab4aec8cb483c9ec75820e4e96c Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Fri, 16 Jun 2023 11:04:20 +0800 Subject: [PATCH 9/9] Update merge_llama_with_chinese_lora_low_mem.py --- scripts/merge_llama_with_chinese_lora_low_mem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/merge_llama_with_chinese_lora_low_mem.py b/scripts/merge_llama_with_chinese_lora_low_mem.py index 87fbf71..4c6b76c 100644 --- a/scripts/merge_llama_with_chinese_lora_low_mem.py +++ b/scripts/merge_llama_with_chinese_lora_low_mem.py @@ -25,7 +25,7 @@ parser.add_argument('--output_type', default='pth',choices=['pth','huggingface'], type=str, help="Save the merged model in pth or huggingface format") parser.add_argument('--output_dir', default='./merged_model', - type=str, help="The output folder where we save the merged mdoel") + type=str, help="The output folder to save the merged model") parser.add_argument('--verbose', default=False, action='store_true', help="Show detailed messages")
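Note: with the full series applied, a typical invocation follows the script's own usage header, for example merging two LoRAs in order and writing pth shards (all paths are placeholders):

    python scripts/merge_llama_with_chinese_lora_low_mem.py \
        --base_model path/to/llama/model \
        --lora_model path/to/first/lora,path/to/second/lora \
        --output_type pth \
        --output_dir ./merged_model \
        --verbose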