From f217f55ae3bfa52592e7a24b338a1d55e6dc4771 Mon Sep 17 00:00:00 2001
From: Ziqing Yang
Date: Thu, 15 Jun 2023 17:13:17 +0800
Subject: [PATCH 1/9] add low-mem-merge script

---
 .../merge_llama_with_chinese_lora_low_mem.py | 334 ++++++++++++++++++
 1 file changed, 334 insertions(+)
 create mode 100644 scripts/merge_llama_with_chinese_lora_low_mem.py

diff --git a/scripts/merge_llama_with_chinese_lora_low_mem.py b/scripts/merge_llama_with_chinese_lora_low_mem.py
new file mode 100644
index 0000000..a770a2c
--- /dev/null
+++ b/scripts/merge_llama_with_chinese_lora_low_mem.py
@@ -0,0 +1,334 @@
+"""
+Usage:
+python merge_llama_with_chinese_lora_low_mem.py \
+    --base_model path/to/llama/model \
+    --lora_model path/to/first/lora/model[,path/to/second/lora/model] \
+    --output_type [pth|huggingface] \
+    --output_dir path/to/output/dir
+
+Low-memory version: the base model is loaded and merged one checkpoint file at
+a time, so peak RAM is roughly bounded by a single checkpoint file plus the
+LoRA weights.
+"""
+import argparse
+import json
+import os
+import gc
+import re
+
+import torch
+import peft
+from transformers import LlamaConfig, LlamaTokenizer
+from transformers.modeling_utils import dtype_byte_size
+from huggingface_hub import snapshot_download
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--base_model', default=None, required=True, type=str,
+                    help="Path (or Hugging Face hub ID) of the base LLaMA model in HF format")
+parser.add_argument('--lora_model', default=None, required=True, type=str,
+                    help="LoRA models to be merged, in order; separate multiple LoRA models with commas")
+parser.add_argument('--output_type', default='pth', choices=['pth', 'huggingface'], type=str,
+                    help="Save the merged model in pth or huggingface format")
+parser.add_argument('--output_dir', default='./', type=str)
+
+# Map the embedding width to the model size, and the model size to the shard
+# count and params.json entries used by the original LLaMA release.
+emb_to_model_size = {
+    4096: '7B',
+    5120: '13B',
+    6656: '33B',
+    8192: '65B',
+}
+num_shards_of_models = {'7B': 1, '13B': 2, '33B': 4, '65B': 8}
+params_of_models = {
+    '7B': {
+        "dim": 4096,
+        "multiple_of": 256,
+        "n_heads": 32,
+        "n_layers": 32,
+        "norm_eps": 1e-06,
+        "vocab_size": -1,
+    },
+    '13B': {
+        "dim": 5120,
+        "multiple_of": 256,
+        "n_heads": 40,
+        "n_layers": 40,
+        "norm_eps": 1e-06,
+        "vocab_size": -1,
+    },
+    '33B': {
+        "dim": 6656,
+        "multiple_of": 256,
+        "n_heads": 52,
+        "n_layers": 60,
+        "norm_eps": 1e-06,
+        "vocab_size": -1,
+    },
+    '65B': {
+        "dim": 8192,
+        "multiple_of": 256,
+        "n_heads": 64,
+        "n_layers": 80,
+        "norm_eps": 1e-05,
+        "vocab_size": -1,
+    },
+}
+
+
+def transpose(weight, fan_in_fan_out):
+    return weight.T if fan_in_fan_out else weight
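+
+# For each LoRA-adapted weight, merging computes W' = W + scaling * (B @ A),
+# where scaling = lora_alpha / r. Layers flagged as fan_in_fan_out (e.g.
+# GPT-2-style Conv1D projections) store W transposed; the transpose() helper
+# above accounts for that.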
f"layers.{layer}.attention_norm.weight" + elif k.endswith(".post_attention_layernorm.weight"): + return f"layers.{layer}.ffn_norm.weight" + elif k.endswith("rotary_emb.inv_freq") or "lora" in k: + return None + else: + print(layer, k) + raise NotImplementedError + else: + print(k) + raise NotImplementedError + + +def unpermute(w): + return ( + w.view(n_heads, 2, dim // n_heads // 2, dim).transpose(1, 2).reshape(dim, dim) + ) + + +def save_shards(model_sd, num_shards: int, prefix=""): + # Add the no_grad context manager + with torch.no_grad(): + if num_shards == 1: + new_state_dict = {} + for k, v in model_sd.items(): + new_k = translate_state_dict_key(k) + if new_k is not None: + if "wq" in new_k or "wk" in new_k: + new_state_dict[new_k] = unpermute(v) + else: + new_state_dict[new_k] = v + + os.makedirs(output_dir, exist_ok=True) + print(f"Saving shard 1 of {num_shards} into {output_dir}/{prefix}consolidated.00.pth") + torch.save(new_state_dict, output_dir + f"/{prefix}consolidated.00.pth") + else: + new_state_dicts = [dict() for _ in range(num_shards)] + for k in list(model_sd.keys()): + v = model_sd[k] + new_k = translate_state_dict_key(k) + if new_k is not None: + if new_k=='tok_embeddings.weight': + print(f"Processing {new_k}") + assert v.size(1)%num_shards==0 + splits = v.split(v.size(1)//num_shards,dim=1) + elif new_k=='output.weight': + print(f"Processing {new_k}") + if v.size(0)%num_shards==0: + splits = v.split(v.size(0)//num_shards,dim=0) + else: + size_list = [v.size(0)//num_shards] * num_shards + size_list[-1] += v.size(0)%num_shards + splits = v.split(size_list, dim=0) # 13B: size_list == [24976,24977] + elif new_k=='norm.weight': + print(f"Processing {new_k}") + splits = [v] * num_shards + elif 'ffn_norm.weight' in new_k: + print(f"Processing {new_k}") + splits = [v] * num_shards + elif 'attention_norm.weight' in new_k: + print(f"Processing {new_k}") + splits = [v] * num_shards + + + elif 'w1.weight' in new_k: + print(f"Processing {new_k}") + splits = v.split(v.size(0)//num_shards,dim=0) + elif 'w2.weight' in new_k: + print(f"Processing {new_k}") + splits = v.split(v.size(1)//num_shards,dim=1) + elif 'w3.weight' in new_k: + print(f"Processing {new_k}") + splits = v.split(v.size(0)//num_shards,dim=0) + + + elif 'wo.weight' in new_k: + print(f"Processing {new_k}") + splits = v.split(v.size(1)//num_shards,dim=1) + + elif 'wv.weight' in new_k: + print(f"Processing {new_k}") + splits = v.split(v.size(0)//num_shards,dim=0) + + elif "wq.weight" in new_k or "wk.weight" in new_k: + print(f"Processing {new_k}") + v = unpermute(v) + splits = v.split(v.size(0)//num_shards,dim=0) + else: + print(f"Unexpected key {new_k}") + raise ValueError + for sd,split in zip(new_state_dicts,splits): + sd[new_k] = split.clone() + del split + del splits + del model_sd[k],v + gc.collect() # Effectively enforce garbage collection + + os.makedirs(output_dir, exist_ok=True) + for i,new_state_dict in enumerate(new_state_dicts): + print(f"Saving shard {i+1} of {num_shards} into {output_dir}/{prefix}consolidated.0{i}.pth") + torch.save(new_state_dict, output_dir + f"/{prefix}consolidated.0{i}.pth") + +def merge_shards(output_dir, num_shards: int): + ckpt_filenames = sorted([f for f in os.listdir(output_dir) if re.match('L(\d+)-consolidated.(\d+).pth',f)]) + + for i in range(num_shards): + shards_filenames = sorted([f for f in ckpt_filenames if re.match(f'L(\d+)-consolidated.0{i}.pth',f)]) + print(f"Loading {shards_filenames} ...") + shards_dicts = [torch.load(os.path.join(output_dir,fn)) for fn in 
+
+def merge_shards(output_dir, num_shards: int):
+    ckpt_filenames = sorted([f for f in os.listdir(output_dir) if re.match(r'L(\d+)-consolidated\.(\d+)\.pth', f)])
+
+    for i in range(num_shards):
+        shards_filenames = sorted([f for f in ckpt_filenames if re.match(rf'L(\d+)-consolidated\.0{i}\.pth', f)])
+        print(f"Loading {shards_filenames} ...")
+        shards_dicts = [torch.load(os.path.join(output_dir, fn)) for fn in shards_filenames]
+        shards_merged = {}
+        for d in shards_dicts:
+            shards_merged |= d  # dict union (requires Python 3.9+)
+
+        print("Saving the merged shard to " + os.path.join(output_dir, f"consolidated.0{i}.pth"))
+        torch.save(shards_merged, os.path.join(output_dir, f"consolidated.0{i}.pth"))
+
+        print("Cleaning up...")
+        del shards_merged
+        for d in shards_dicts:
+            del d
+        del shards_dicts
+        gc.collect()  # free the loaded shards before the next iteration
+        for fn in shards_filenames:
+            os.remove(os.path.join(output_dir, fn))
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    base_model_path = args.base_model
+    lora_model_paths = [s.strip() for s in args.lora_model.split(',') if len(s.strip()) != 0]
+    output_dir = args.output_dir
+    output_type = args.output_type
+    os.makedirs(output_dir, exist_ok=True)
+
+    print(f"Base model: {base_model_path}")
+    print(f"LoRA model(s): {lora_model_paths}")
+
+    tokenizers_and_loras = []
+    for lora_model_path in lora_model_paths:
+        print(f"Loading {lora_model_path}")
+        if not os.path.exists(lora_model_path):
+            print("Cannot find LoRA model on disk. Downloading LoRA model from hub...")
+            lora_model_path = snapshot_download(repo_id=lora_model_path)
+        tokenizer = LlamaTokenizer.from_pretrained(lora_model_path)
+        lora_config = peft.LoraConfig.from_pretrained(lora_model_path)
+        lora_state_dict = torch.load(os.path.join(lora_model_path, 'adapter_model.bin'), map_location='cpu')
+        tokenizers_and_loras.append(
+            {
+                "tokenizer": tokenizer,
+                "state_dict": lora_state_dict,
+                "config": lora_config,
+                "scaling": lora_config.lora_alpha / lora_config.r,
+                "fan_in_fan_out": lora_config.fan_in_fan_out,
+            })
+
+    if not os.path.exists(base_model_path):
+        print("Cannot find base model on disk. Downloading base model from hub...")
+        base_model_path = snapshot_download(repo_id=base_model_path)
+    ckpt_filenames = sorted([f for f in os.listdir(base_model_path) if re.match(r'pytorch_model-(\d+)-of-(\d+)\.bin', f)])
+
+    embedding_size = None
+    model_size = None
+
+    total_size = 0
+    for index, filename in enumerate(ckpt_filenames):
+        print(f"Loading ckpt {filename}")
+        state_dict = torch.load(os.path.join(base_model_path, filename), map_location='cpu')
+        if index == 0:
+            # Infer the model size from the embedding width of the first checkpoint.
+            embedding_size = state_dict['model.embed_tokens.weight'].shape[1]
+            model_size = emb_to_model_size[embedding_size]
+            if output_type == 'pth':
+                params = params_of_models[model_size]
+                num_shards = num_shards_of_models[model_size]
+                n_layers = params["n_layers"]
+                n_heads = params["n_heads"]
+                dim = params["dim"]
+                dims_per_head = dim // n_heads
+                base = 10000.0
+                inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
+        for k in state_dict:
+            for ti, tandl in enumerate(tokenizers_and_loras):
+                saved_key = 'base_model.model.' + k
+                lora_key_A = saved_key.replace('.weight', '.lora_A.weight')
+                if saved_key in tandl['state_dict']:
+                    # The LoRA checkpoint stores a full replacement for this tensor
+                    # (e.g. the resized embedding / lm_head); copy it over.
+                    print(f"copying {saved_key} from {ti}-th LoRA weight to {k}")
+                    state_dict[k] = tandl['state_dict'][saved_key].half().clone()  # cast to fp16 to match the base weights
+                if lora_key_A in tandl['state_dict']:
+                    lora_key_B = lora_key_A.replace('lora_A.weight', 'lora_B.weight')
+                    print(f"merging {lora_key_A} and lora_B.weight from {ti}-th LoRA weight to {k}")
+                    state_dict[k] += (
+                        transpose(
+                            tandl['state_dict'][lora_key_B].float()
+                            @ tandl['state_dict'][lora_key_A].float(), tandl['fan_in_fan_out']) * tandl['scaling']
+                    )
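+                    # For example, with r=8 and lora_alpha=32 the scaling is
+                    # 32/8 = 4.0, so W <- W + 4.0 * (B @ A); lora_B.weight is
+                    # (out_features, r) and lora_A.weight is (r, in_features),
+                    # so their product matches W's shape.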
+            weight_size = state_dict[k].numel() * dtype_byte_size(state_dict[k].dtype)
+            total_size += weight_size
+
+        if output_type == 'huggingface':
+            print(f"Saving ckpt {filename} to {output_dir} in HF format...")
+            torch.save(state_dict, os.path.join(output_dir, filename))
+        elif output_type == 'pth':
+            print(f"Saving ckpt {filename} to {output_dir} in pth format...")
+            save_shards(model_sd=state_dict, num_shards=num_shards, prefix=f"L{index+1}-")
+        del state_dict
+        gc.collect()  # release the merged checkpoint before loading the next one
+
+    print("Saving tokenizer")
+    # The last LoRA's tokenizer carries the final (largest) vocabulary.
+    tokenizers_and_loras[-1]['tokenizer'].save_pretrained(output_dir)
+    if output_type == 'pth':
+        with open(output_dir + "/params.json", "w") as f:
+            print(f"Saving params.json into {output_dir}/params.json")
+            json.dump(params, f)
+        merge_shards(output_dir, num_shards=num_shards)
+
+    if output_type == 'huggingface':
+        configs = ('config.json', 'generation_config.json', 'pytorch_model.bin.index.json')
+        for config in configs:
+            if os.path.exists(os.path.join(base_model_path, config)):
+                print(f"Saving {config}")
+                obj = json.load(open(os.path.join(base_model_path, config)))
+                if config == 'pytorch_model.bin.index.json':
+                    obj['metadata']['total_size'] = total_size
+                json.dump(obj, open(os.path.join(output_dir, config), 'w'), indent=2)
+    print("Done.")
\ No newline at end of file

From ddfe5b80323c3740548039ab2475efcc669b2c63 Mon Sep 17 00:00:00 2001
From: ymcui
Date: Thu, 15 Jun 2023 17:17:44 +0800
Subject: [PATCH 2/9] update new conversion notebook (low-mem)

---
 notebooks/README.md                           |   22 +-
 ...ert_and_quantize_chinese_alpaca_plus.ipynb | 1171 ----------
 .../convert_and_quantize_chinese_llama.ipynb  | 1874 -----------------
 3 files changed, 10 insertions(+), 3057 deletions(-)
 delete mode 100644 notebooks/convert_and_quantize_chinese_alpaca_plus.ipynb
 delete mode 100644 notebooks/convert_and_quantize_chinese_llama.ipynb

diff --git a/notebooks/README.md b/notebooks/README.md
index 149f3a6..27b4486 100644
--- a/notebooks/README.md
+++ b/notebooks/README.md
@@ -1,20 +1,12 @@
 # 笔记本示例 Notebooks
 
-### convert_and_quantize_chinese_llama.ipynb
+### convert_and_quantize_chinese_llama_and_alpaca.ipynb
 
-Colab上的转换和量化中文LLaMA/Alpaca的运行示例(仅供流程参考)。
+Colab上的转换和量化中文LLaMA/Alpaca(含Plus版本)的运行示例(仅供流程参考)。
 
 Example of conversion and quantization for Chinese LLaMA/Alpaca.
 
-建议查看Colab上的最新版 / Check latest notebook:Open In Colab
-
-### convert_and_quantize_chinese_alpaca_plus.ipynb
-
-Colab上的转换和量化中文Alpaca-Plus的运行示例(仅供流程参考)。
-
-Example of conversion and quantization for Chinese Alpaca-Plus.
-
-建议查看Colab上的最新版 / Check latest notebook:Open In Colab
+建议查看Colab上的最新版 / Check latest notebook:Open In Colab
 
 ### pretrain_chinese_llama_lora.ipynb
 
@@ -38,4 +30,10 @@ Colab上的Gradio演示示例。
 
 Example of running the Gradio demo on Colab.
 
-在Colab中打开 / Open the notebook in Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ymcui/Chinese-LLaMA-Alpaca/blob/main/notebooks/gradio_web_demo.ipynb)
\ No newline at end of file
+在Colab中打开 / Open the notebook in Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ymcui/Chinese-LLaMA-Alpaca/blob/main/notebooks/gradio_web_demo.ipynb)
+
+### legacy/
+
+旧版notebook,供参考,但不会再更新。
+
+Old notebooks, kept for reference only; they will no longer be updated.
\ No newline at end of file diff --git a/notebooks/convert_and_quantize_chinese_alpaca_plus.ipynb b/notebooks/convert_and_quantize_chinese_alpaca_plus.ipynb deleted file mode 100644 index b3bf1e3..0000000 --- a/notebooks/convert_and_quantize_chinese_alpaca_plus.ipynb +++ /dev/null @@ -1,1171 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "B1c96_k3MahN" - }, - "source": [ - "# 转换并量化中文Alpaca Plus模型\n", - "\n", - "关于其他模型请参考另一个notebook:https://colab.research.google.com/drive/1Eak6azD3MLeb-YsfbP8UZC8wrL1ddIMI?usp=sharing\n", - "\n", - "\n", - "🎉🎉🎉 **新:现在免费用户也有机会能够转换7B和13B模型了!**\n", - "\n", - "💡 提示和小窍门:\n", - "- 免费用户默认的内存只有12G左右,**笔者用免费账号实测选择TPU的话有机会随机出35G内存**,建议多试几次。如果能随机出25G内存以上的机器就可以了转换7B模型了,35G内存以上机器就能转换13B模型了\n", - "- Pro(+)用户请选择 “代码执行程序” -> “更改运行时类型” -> “高RAM”\n", - "- 实测:转换7B级别模型,25G内存的机器就够了;转换13B级别模型需要30G以上的内存(程序莫名崩掉或断开连接就说明内存爆了)\n", - "- 如果选了“高RAM”之后内存还是不够大的话,选择以下操作,有的时候会分配出很高内存的机器,祝你好运😄!\n", - " - 可以把GPU或者TPU也选上(虽然不会用到)\n", - " - 选GPU时,Pro用户可选“高级”类型GPU\n", - "\n", - "以下信息配置信息供参考(Pro订阅下测试),运行时规格设置为“高RAM”时的设备配置如下(有随机性):\n", - "\n", - "| 硬件加速器 | RAM | 硬盘 |\n", - "| :-- | :--: | :--: |\n", - "| None | 25GB | 225GB |\n", - "| TPU | 35GB | 225GB |\n", - "| GPU(标准,T4)| 25GB | 166GB |\n", - "| GPU(高性能,V100)| 25GB | 166GB |\n", - "| GPU(高性能,A100)| **80GB** | 166GB |\n", - "\n", - "*温馨提示:用完之后注意断开运行时,选择满足要求的最低配置即可,避免不必要的计算单元消耗(Pro只给100个计算单元)。*" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vScqHD_jMFOV" - }, - "source": [ - "## 安装相关依赖" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "E5WKFJXIL6ZU", - "outputId": "87a89bed-053e-4e61-e2f8-1dfcbdf87fbf" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Collecting torch==1.12.0\n", - " Downloading torch-1.12.0-cp310-cp310-manylinux1_x86_64.whl (776.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m776.3/776.3 MB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch==1.12.0) (4.5.0)\n", - "Installing collected packages: torch\n", - " Attempting uninstall: torch\n", - " Found existing installation: torch 2.0.0+cu118\n", - " Uninstalling torch-2.0.0+cu118:\n", - " Successfully uninstalled torch-2.0.0+cu118\n", - "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", - "torchvision 0.15.1+cu118 requires torch==2.0.0, but you have torch 1.12.0 which is incompatible.\n", - "torchtext 0.15.1 requires torch==2.0.0, but you have torch 1.12.0 which is incompatible.\n", - "torchdata 0.6.0 requires torch==2.0.0, but you have torch 1.12.0 which is incompatible.\n", - "torchaudio 2.0.1+cu118 requires torch==2.0.0, but you have torch 1.12.0 which is incompatible.\n", - "peft 0.2.0 requires torch>=1.13.0, but you have torch 1.12.0 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0mSuccessfully installed torch-1.12.0\n", - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.28.1)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n", - "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)\n", - "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n", - "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n", - "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.13.3)\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4)\n", - "Requirement already satisfied: huggingface-hub<1.0,>=0.11.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.14.1)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.0)\n", - "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0)\n", - "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (2023.4.0)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (4.5.0)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", - "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.12)\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.15)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2022.12.7)\n", - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Collecting git+https://github.com/huggingface/peft\n", - " Cloning https://github.com/huggingface/peft to /tmp/pip-req-build-tnxzt7q0\n", - " Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft /tmp/pip-req-build-tnxzt7q0\n", - " Resolved https://github.com/huggingface/peft to commit 632997d1fb776c3cf05d8c2537ac9a98a7ce9435\n", - " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", - " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", - " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", - "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (23.1)\n", - "Requirement already satisfied: accelerate in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (0.18.0)\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (1.22.4)\n", - "Collecting torch>=1.13.0\n", - " Downloading torch-2.0.0-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m619.9/619.9 MB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (6.0)\n", - "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (5.9.5)\n", - "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (4.28.1)\n", - "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (3.1)\n", - "Collecting nvidia-cufft-cu11==10.9.0.58\n", - " Downloading nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux1_x86_64.whl (168.4 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m168.4/168.4 MB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cudnn-cu11==8.5.0.96\n", - " Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl (557.1 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m557.1/557.1 MB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (2.0.0)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (3.12.0)\n", - "Collecting nvidia-cuda-runtime-cu11==11.7.99\n", - " Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m849.3/849.3 kB\u001b[0m \u001b[31m48.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (3.1.2)\n", - "Collecting nvidia-nccl-cu11==2.14.3\n", - " Downloading nvidia_nccl_cu11-2.14.3-py3-none-manylinux1_x86_64.whl (177.1 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m177.1/177.1 MB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (1.11.1)\n", - "Collecting nvidia-cusparse-cu11==11.7.4.91\n", - " Downloading nvidia_cusparse_cu11-11.7.4.91-py3-none-manylinux1_x86_64.whl (173.2 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m173.2/173.2 MB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cublas-cu11==11.10.3.66\n", - " Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl (317.1 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m 
\u001b[32m317.1/317.1 MB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting nvidia-nvtx-cu11==11.7.91\n", - " Downloading nvidia_nvtx_cu11-11.7.91-py3-none-manylinux1_x86_64.whl (98 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.6/98.6 kB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (4.5.0)\n", - "Collecting nvidia-curand-cu11==10.2.10.91\n", - " Downloading nvidia_curand_cu11-10.2.10.91-py3-none-manylinux1_x86_64.whl (54.6 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.6/54.6 MB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cusolver-cu11==11.4.0.1\n", - " Downloading nvidia_cusolver_cu11-11.4.0.1-2-py3-none-manylinux1_x86_64.whl (102.6 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m102.6/102.6 MB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cuda-nvrtc-cu11==11.7.99\n", - " Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.0/21.0 MB\u001b[0m \u001b[31m63.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cuda-cupti-cu11==11.7.101\n", - " Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.8/11.8 MB\u001b[0m \u001b[31m75.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: wheel in /usr/local/lib/python3.10/dist-packages (from nvidia-cublas-cu11==11.10.3.66->torch>=1.13.0->peft==0.3.0.dev0) (0.40.0)\n", - "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from nvidia-cublas-cu11==11.10.3.66->torch>=1.13.0->peft==0.3.0.dev0) (67.7.2)\n", - "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.13.0->peft==0.3.0.dev0) (3.25.2)\n", - "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.13.0->peft==0.3.0.dev0) (16.0.2)\n", - "Requirement already satisfied: huggingface-hub<1.0,>=0.11.0 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (0.14.1)\n", - "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (2022.10.31)\n", - "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (0.13.3)\n", - "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (4.65.0)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (2.27.1)\n", - "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers->peft==0.3.0.dev0) (2023.4.0)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.13.0->peft==0.3.0.dev0) (2.1.2)\n", 
- "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.3.0.dev0) (2.0.12)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.3.0.dev0) (2022.12.7)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.3.0.dev0) (3.4)\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.3.0.dev0) (1.26.15)\n", - "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.13.0->peft==0.3.0.dev0) (1.3.0)\n", - "Building wheels for collected packages: peft\n", - " Building wheel for peft (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for peft: filename=peft-0.3.0.dev0-py3-none-any.whl size=55537 sha256=3cc2a65c09926ac217ac671b7d9c1640eac9857f0aca55b78a9fcda484263073\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-1rjlvx70/wheels/4c/16/67/1002a2d4daa822eff130e6d85b90051b75d2ce0d26b9448e4a\n", - "Successfully built peft\n", - "Installing collected packages: nvidia-nvtx-cu11, nvidia-nccl-cu11, nvidia-cusparse-cu11, nvidia-curand-cu11, nvidia-cufft-cu11, nvidia-cuda-runtime-cu11, nvidia-cuda-nvrtc-cu11, nvidia-cuda-cupti-cu11, nvidia-cublas-cu11, nvidia-cusolver-cu11, nvidia-cudnn-cu11, torch, peft\n", - " Attempting uninstall: torch\n", - " Found existing installation: torch 1.12.0\n", - " Uninstalling torch-1.12.0:\n", - " Successfully uninstalled torch-1.12.0\n", - " Attempting uninstall: peft\n", - " Found existing installation: peft 0.2.0\n", - " Uninstalling peft-0.2.0:\n", - " Successfully uninstalled peft-0.2.0\n", - "Successfully installed nvidia-cublas-cu11-11.10.3.66 nvidia-cuda-cupti-cu11-11.7.101 nvidia-cuda-nvrtc-cu11-11.7.99 nvidia-cuda-runtime-cu11-11.7.99 nvidia-cudnn-cu11-8.5.0.96 nvidia-cufft-cu11-10.9.0.58 nvidia-curand-cu11-10.2.10.91 nvidia-cusolver-cu11-11.4.0.1 nvidia-cusparse-cu11-11.7.4.91 nvidia-nccl-cu11-2.14.3 nvidia-nvtx-cu11-11.7.91 peft-0.3.0.dev0 torch-2.0.0\n", - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (0.1.98)\n" - ] - } - ], - "source": [ - "!pip install torch==1.12.0\n", - "!pip install transformers\n", - "!pip install git+https://github.com/huggingface/peft\n", - "!pip install sentencepiece" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ygb1xFIMNQKw" - }, - "source": [ - "## 克隆目录和代码" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "yCEJh7NJNXz9", - "outputId": "ec16f31b-7af7-4eb8-82ce-5f9317bad941" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Cloning into 'Chinese-LLaMA-Alpaca'...\n", - "remote: Enumerating objects: 761, done.\u001b[K\n", - "remote: Counting objects: 100% (202/202), done.\u001b[K\n", - "remote: Compressing objects: 100% (172/172), done.\u001b[K\n", - "remote: Total 761 (delta 54), reused 69 (delta 29), pack-reused 559\u001b[K\n", - "Receiving objects: 100% (761/761), 11.16 MiB | 22.49 MiB/s, done.\n", - "Resolving deltas: 100% (444/444), done.\n", - "Cloning into 'llama.cpp'...\n", - "remote: Enumerating objects: 2086, done.\u001b[K\n", - 
"remote: Counting objects: 100% (842/842), done.\u001b[K\n", - "remote: Compressing objects: 100% (99/99), done.\u001b[K\n", - "remote: Total 2086 (delta 778), reused 756 (delta 743), pack-reused 1244\u001b[K\n", - "Receiving objects: 100% (2086/2086), 2.12 MiB | 16.33 MiB/s, done.\n", - "Resolving deltas: 100% (1345/1345), done.\n" - ] - } - ], - "source": [ - "!git clone https://github.com/ymcui/Chinese-LLaMA-Alpaca\n", - "!git clone https://github.com/ggerganov/llama.cpp" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nIyxX0DSNsgQ" - }, - "source": [ - "## 合并模型(Alpaca-Plus-7B)\n", - "\n", - "**⚠️ 再次提醒:7B模型需要25G内存,13B模型需要35G+内存。**\n", - "\n", - "此处使用的是🤗模型库中提供的基模型(已是HF格式),而不是Facebook官方的LLaMA模型,因此略去将原版LLaMA转换为HF格式的步骤。\n", - "\n", - "**这里直接运行第二步:合并LoRA权重**,生成全量模型权重。可以直接指定🤗模型库的地址,也可以是本地存放地址。\n", - "- 基模型:`decapoda-research/llama-7b-hf` *(use at your own risk)*\n", - "- LoRA模型:先写`ziqingyang/chinese-llama-plus-lora-7b`然后再写`ziqingyang/chinese-alpaca-plus-lora-7b`\n", - "- 输出类型:因为后续要量化,这里将`output_type`设置为`pth`\n", - "\n", - "💡 转换13B模型提示:\n", - "- 请将参数`--base_model`和`--lora_model`中的的`7b`改为`13b`即可\n", - "- **免费用户必须增加一个参数`--offload_dir`以缓解内存压力**,例如`--offload_dir ./offload_temp`\n", - "\n", - "该过程比较耗时(下载+转换),需要几分钟到十几分钟不等,请耐心等待。\n", - "转换好的模型存放在`alpaca-combined`目录。\n", - "如果你不需要量化模型,那么到这一步就结束了。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5AV4EW5hNhVV", - "outputId": "91901b82-88c4-405d-cf86-32f1a3a60467" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "2023-04-28 08:07:00.276520: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "Base model: decapoda-research/llama-7b-hf\n", - "LoRA model(s) ['ziqingyang/chinese-llama-plus-lora-7b', 'ziqingyang/chinese-alpaca-plus-lora-7b']:\n", - "Loading checkpoint shards: 100% 33/33 [01:18<00:00, 2.39s/it]\n", - "Peft version: 0.3.0.dev0\n", - "Loading LoRA for 7B model\n", - "Loading LoRA ziqingyang/chinese-llama-plus-lora-7b\n", - "Extended vocabulary size to 49953\n", - "Downloading (…)/adapter_config.json: 100% 420/420 [00:00<00:00, 1.61MB/s]\n", - "Downloading adapter_model.bin: 100% 858M/858M [00:04<00:00, 185MB/s]\n", - "Merging with merge_and_unload...\n", - "Loading LoRA ziqingyang/chinese-alpaca-plus-lora-7b\n", - "Downloading tokenizer.model: 100% 758k/758k [00:00<00:00, 13.4MB/s]\n", - "Downloading (…)cial_tokens_map.json: 100% 96.0/96.0 [00:00<00:00, 535kB/s]\n", - "Downloading (…)okenizer_config.json: 100% 166/166 [00:00<00:00, 854kB/s]\n", - "Extended vocabulary size to 49954\n", - "Downloading (…)/adapter_config.json: 100% 423/423 [00:00<00:00, 2.31MB/s]\n", - "Downloading adapter_model.bin: 100% 1.14G/1.14G [00:16<00:00, 70.6MB/s]\n", - "Merging with merge_and_unload...\n", - "Saving to pth format...\n", - "Saving shard 1 of 1 into alpaca-combined/consolidated.00.pth\n" - ] - } - ], - "source": [ - "!python ./Chinese-LLaMA-Alpaca/scripts/merge_llama_with_chinese_lora.py \\\n", - " --base_model decapoda-research/llama-7b-hf \\\n", - " --lora_model ziqingyang/chinese-llama-plus-lora-7b,ziqingyang/chinese-alpaca-plus-lora-7b \\\n", - " --output_type pth \\\n", - " --output_dir alpaca-combined" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ueexcKo-Q_EW" - }, - "source": [ - "## 量化模型\n", - "接下来我们使用[llama.cpp](https://github.com/ggerganov/llama.cpp)工具对上一步生成的全量版本权重进行转换,生成4-bit量化模型。\n", - "\n", - "### 编译工具\n", - "\n", - 
"首先对llama.cpp工具进行编译。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "_GbjsT2wRRCR", - "outputId": "2b4f2a38-d22d-4764-9a81-bad8bd72b7fe" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "I llama.cpp build info: \n", - "I UNAME_S: Linux\n", - "I UNAME_P: x86_64\n", - "I UNAME_M: x86_64\n", - "I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -pthread -march=native -mtune=native\n", - "I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native\n", - "I LDFLAGS: \n", - "I CC: cc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", - "I CXX: g++ (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", - "\n", - "cc -I. -O3 -DNDEBUG -std=c11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -pthread -march=native -mtune=native -c ggml.c -o ggml.o\n", - "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native -c llama.cpp -o llama.o\n", - "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native -c examples/common.cpp -o common.o\n", - "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native examples/main/main.cpp ggml.o llama.o common.o -o main \n", - "\n", - "==== Run ./main -h for help. ====\n", - "\n", - "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native examples/quantize/quantize.cpp ggml.o llama.o -o quantize \n", - "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats \n", - "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity \n", - "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding \n", - "g++ -I. 
-I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native pocs/vdot/vdot.cpp ggml.o -o vdot \n" - ] - } - ], - "source": [ - "!cd llama.cpp && make" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gw2xpYC0RcQC" - }, - "source": [ - "### 模型转换为ggml格式(FP16)\n", - "\n", - "这一步,我们将模型转换为ggml格式(FP16)。\n", - "- 在这之前需要把`alpaca-combined`目录挪个位置,把模型文件放到`llama.cpp/zh-models/7B`下,把`tokenizer.model`放到`llama.cpp/zh-models`\n", - "- tokenizer在哪里?\n", - " - `alpaca-combined`目录下有\n", - " - 或者从以下网址下载:https://huggingface.co/ziqingyang/chinese-alpaca-lora-7b/resolve/main/tokenizer.model (注意,Alpaca和LLaMA的`tokenizer.model`不能混用!)\n", - "\n", - "💡 转换13B模型提示:\n", - "- tokenizer可以直接用7B的,13B和7B的相同\n", - "- Alpaca和LLaMA的`tokenizer.model`不能混用!\n", - "- 以下看到7B字样的都是文件夹名,与转换过程没有关系了,改不改都行" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5KgnFVStRjio", - "outputId": "19293a4a-a400-4cd3-c98b-80022dcd1f35" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "7B tokenizer.model\n" - ] - } - ], - "source": [ - "!cd llama.cpp && mkdir zh-models && mv ../alpaca-combined zh-models/7B\n", - "!mv llama.cpp/zh-models/7B/tokenizer.model llama.cpp/zh-models/\n", - "!ls llama.cpp/zh-models/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "NUHeoTMQS1AQ", - "outputId": "378b70db-d13b-4aa9-8bb0-a1fc1cd4b13f" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Loading model file zh-models/7B/consolidated.00.pth\n", - "Loading vocab file zh-models/tokenizer.model\n", - "Writing vocab...\n", - "[ 1/291] Writing tensor tok_embeddings.weight | size 49954 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 2/291] Writing tensor norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 3/291] Writing tensor output.weight | size 49954 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 4/291] Writing tensor layers.0.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 5/291] Writing tensor layers.0.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 6/291] Writing tensor layers.0.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 7/291] Writing tensor layers.0.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 8/291] Writing tensor layers.0.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 9/291] Writing tensor layers.0.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 10/291] Writing tensor layers.0.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[ 11/291] Writing tensor layers.0.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 12/291] Writing tensor layers.0.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 13/291] Writing tensor layers.1.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 14/291] Writing tensor layers.1.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 15/291] Writing tensor layers.1.attention.wv.weight | size 4096 x 4096 | type 
UnquantizedDataType(name='F16')\n", - "[ 16/291] Writing tensor layers.1.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 17/291] Writing tensor layers.1.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 18/291] Writing tensor layers.1.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 19/291] Writing tensor layers.1.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[ 20/291] Writing tensor layers.1.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 21/291] Writing tensor layers.1.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 22/291] Writing tensor layers.2.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 23/291] Writing tensor layers.2.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 24/291] Writing tensor layers.2.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 25/291] Writing tensor layers.2.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 26/291] Writing tensor layers.2.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 27/291] Writing tensor layers.2.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 28/291] Writing tensor layers.2.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[ 29/291] Writing tensor layers.2.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 30/291] Writing tensor layers.2.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 31/291] Writing tensor layers.3.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 32/291] Writing tensor layers.3.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 33/291] Writing tensor layers.3.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 34/291] Writing tensor layers.3.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 35/291] Writing tensor layers.3.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 36/291] Writing tensor layers.3.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 37/291] Writing tensor layers.3.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[ 38/291] Writing tensor layers.3.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 39/291] Writing tensor layers.3.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 40/291] Writing tensor layers.4.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 41/291] Writing tensor layers.4.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 42/291] Writing tensor layers.4.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 43/291] Writing tensor layers.4.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 44/291] Writing tensor layers.4.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 45/291] Writing tensor layers.4.feed_forward.w1.weight | size 11008 x 4096 | 
type UnquantizedDataType(name='F16')\n", - "[ 46/291] Writing tensor layers.4.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[ 47/291] Writing tensor layers.4.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 48/291] Writing tensor layers.4.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 49/291] Writing tensor layers.5.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 50/291] Writing tensor layers.5.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 51/291] Writing tensor layers.5.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 52/291] Writing tensor layers.5.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 53/291] Writing tensor layers.5.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 54/291] Writing tensor layers.5.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 55/291] Writing tensor layers.5.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[ 56/291] Writing tensor layers.5.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 57/291] Writing tensor layers.5.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 58/291] Writing tensor layers.6.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 59/291] Writing tensor layers.6.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 60/291] Writing tensor layers.6.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 61/291] Writing tensor layers.6.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 62/291] Writing tensor layers.6.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 63/291] Writing tensor layers.6.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 64/291] Writing tensor layers.6.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[ 65/291] Writing tensor layers.6.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 66/291] Writing tensor layers.6.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 67/291] Writing tensor layers.7.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 68/291] Writing tensor layers.7.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 69/291] Writing tensor layers.7.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 70/291] Writing tensor layers.7.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 71/291] Writing tensor layers.7.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 72/291] Writing tensor layers.7.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 73/291] Writing tensor layers.7.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[ 74/291] Writing tensor layers.7.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 75/291] Writing tensor layers.7.ffn_norm.weight | size 4096 | 
type UnquantizedDataType(name='F32')\n", - "[ 76/291] Writing tensor layers.8.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 77/291] Writing tensor layers.8.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 78/291] Writing tensor layers.8.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 79/291] Writing tensor layers.8.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 80/291] Writing tensor layers.8.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 81/291] Writing tensor layers.8.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 82/291] Writing tensor layers.8.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[ 83/291] Writing tensor layers.8.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 84/291] Writing tensor layers.8.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 85/291] Writing tensor layers.9.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 86/291] Writing tensor layers.9.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 87/291] Writing tensor layers.9.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 88/291] Writing tensor layers.9.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 89/291] Writing tensor layers.9.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 90/291] Writing tensor layers.9.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 91/291] Writing tensor layers.9.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[ 92/291] Writing tensor layers.9.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 93/291] Writing tensor layers.9.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 94/291] Writing tensor layers.10.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 95/291] Writing tensor layers.10.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 96/291] Writing tensor layers.10.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 97/291] Writing tensor layers.10.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[ 98/291] Writing tensor layers.10.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[ 99/291] Writing tensor layers.10.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[100/291] Writing tensor layers.10.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n", - "[101/291] Writing tensor layers.10.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n", - "[102/291] Writing tensor layers.10.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "[103/291] Writing tensor layers.11.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[104/291] Writing tensor layers.11.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[105/291] Writing tensor layers.11.attention.wv.weight | 
size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "[106/291] Writing tensor layers.11.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n", - "... [output truncated: tensors 107-290 repeat the same per-layer pattern (attention wq/wk/wv/wo, attention_norm, feed_forward w1/w2/w3, ffn_norm) for layers 11-31] ...\n", - "[291/291] Writing tensor layers.31.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n", - "Wrote zh-models/7B/ggml-model-f16.bin\n" - ] - } - ], - "source": [ - "!cd llama.cpp && python convert.py zh-models/7B/" - ] - },
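The cell above runs llama.cpp's `convert.py` to turn the merged checkpoint into a single `ggml-model-f16.bin`. For scripted pipelines, a small wrapper can drive this step and the quantization step that follows from Python; this is a hypothetical sketch, not part of this patch, and the `./quantize` argument shape (`<input> <output> <type>`) is an assumption that varies across llama.cpp revisions.

```python
# Hypothetical helper, not part of this PR: run the same llama.cpp steps
# from Python instead of notebook shell magics. The quantize CLI shape
# (<input> <output> <type>) is an assumption and may differ by revision.
import subprocess

def convert_and_quantize(model_dir: str = "zh-models/7B",
                         qtype: str = "q8_0") -> None:
    # Step 1: merged checkpoint -> ggml FP16 (writes ggml-model-f16.bin),
    # exactly what the notebook cell above does.
    subprocess.run(["python", "convert.py", model_dir],
                   cwd="llama.cpp", check=True)
    # Step 2: FP16 -> quantized ggml (the step shown next in the notebook,
    # whose recorded run saved to ggml-model-q4_0.bin).
    subprocess.run(["./quantize",
                    f"{model_dir}/ggml-model-f16.bin",
                    f"{model_dir}/ggml-model-{qtype}.bin",
                    qtype],
                   cwd="llama.cpp", check=True)
```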
- { - "cell_type": "markdown", - "metadata": { - "id": "hEZEJAVYCHkc" - }, - "source": [ - "### Quantize the FP16 model to 8-bit\n", - "\n", - "We further convert the FP16 model into an 8-bit quantized model." - ] - },
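The size ratio in the log below, e.g. 32.00 MB -> 18.00 MB for each 4096 x 4096 tensor, is exactly 9/16: block-wise 8-bit quantization stores 32 int8 values plus one f32 scale per block, i.e. 9 bits per weight instead of 16. A minimal NumPy sketch of such a scheme follows; it is an illustration, not llama.cpp's actual implementation, and the 32-element block with an f32 absmax scale is the assumed layout.

```python
# Sketch of block-wise absmax int8 quantization (assumed q8_0-style layout:
# 32 int8 values + one f32 scale per block = 9 bits/weight vs 16 for f16).
import numpy as np

BLOCK = 32  # assumed block size

def quantize_q8_0(w: np.ndarray):
    """Quantize a flat array to per-block int8 values plus f32 scales."""
    blocks = w.astype(np.float32).reshape(-1, BLOCK)
    scale = np.abs(blocks).max(axis=1, keepdims=True) / 127.0  # absmax per block
    q = np.round(blocks / np.where(scale == 0.0, 1.0, scale)).astype(np.int8)
    return q, scale.astype(np.float32)

w = np.random.randn(4096 * 4096).astype(np.float16)  # one attention matrix
q, s = quantize_q8_0(w)
bytes_f16 = w.size * 2              # 16 bits per weight
bytes_q8 = q.size + s.size * 4      # int8 payload + f32 scale per block
print(f"{bytes_f16 / 2**20:.2f} MB -> {bytes_q8 / 2**20:.2f} MB")
# -> 32.00 MB -> 18.00 MB, the 9/16 ratio seen in the log below
```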
- { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "2xyais7OUVDI", - "outputId": "b7fe3c62-489a-42e5-927a-8ab6088a3ecc" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "llama.cpp: loading model from ./zh-models/7B/ggml-model-f16.bin\n", - "llama.cpp: saving model to ./zh-models/7B/ggml-model-q4_0.bin\n", - "[ 1/ 291] tok_embeddings.weight - 4096 x 49954, type = f16, quantizing .. size = 390.27 MB -> 219.52 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 2/ 291] norm.weight - 4096, type = f32, size = 0.016 MB\n", - "[ 3/ 291] output.weight - 4096 x 49954, type = f16, quantizing .. size = 390.27 MB -> 219.52 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 4/ 291] layers.0.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.026 0.018 0.028 0.044 0.064 0.088 0.111 0.245 0.111 0.087 0.064 0.044 0.028 0.018 0.026 \n", - "[ 5/ 291] layers.0.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.026 0.017 0.028 0.043 0.063 0.087 0.111 0.250 0.112 0.087 0.063 0.043 0.028 0.017 0.026 \n", - "[ 6/ 291] layers.0.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.019 0.031 0.046 0.065 0.087 0.107 0.237 0.107 0.087 0.065 0.046 0.030 0.019 0.027 \n", - "[ 7/ 291] layers.0.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.026 0.017 0.027 0.042 0.062 0.087 0.113 0.253 0.113 0.087 0.062 0.042 0.027 0.017 0.026 \n", - "[ 8/ 291] layers.0.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", - "[ 9/ 291] layers.0.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 10/ 291] layers.0.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.047 0.031 0.020 0.027 \n", - "[ 11/ 291] layers.0.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.047 0.067 0.088 0.106 0.227 0.106 0.088 0.067 0.047 0.032 0.020 0.027 \n", - "[ 12/ 291] layers.0.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n",
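From the per-tensor numbers above one can estimate the final file sizes. A sanity-check sketch, using only the sizes printed in the log: it is approximate, ignores file headers and the tiny f32 norm tensors, and assumes layers 1-31 match layers.0, which the remaining (truncated) output below bears out.

```python
# Rough total-size estimate from the per-tensor log lines above (MB as printed).
# Ignores headers and the 0.016 MB f32 norm tensors; assumes layers 1-31
# repeat layers.0, as the truncated quantize output below shows.
emb_out_f16, emb_out_q8 = 390.27, 219.52   # tok_embeddings and output, each
attn_f16, attn_q8 = 32.00, 18.00           # wq/wk/wv/wo, each
ffn_f16, ffn_q8 = 86.00, 48.38             # w1/w2/w3, each
layers = 32
f16_total = 2 * emb_out_f16 + layers * (4 * attn_f16 + 3 * ffn_f16)
q8_total = 2 * emb_out_q8 + layers * (4 * attn_q8 + 3 * ffn_q8)
print(f"f16 ~ {f16_total / 1024:.1f} GB, 8-bit ~ {q8_total / 1024:.1f} GB")
# -> f16 ~ 12.8 GB, 8-bit ~ 7.2 GB (both before headers and norm weights)
```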
- "... [output truncated: tensors 13-261 repeat the same pattern as layers.0 for layers 1-28, with near-identical sizes and histograms] ...\n", - "[ 262/ 291] layers.28.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. 
size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.047 0.031 0.020 0.027 \n", - "[ 263/ 291] layers.28.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 264/ 291] layers.28.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", - "[ 265/ 291] layers.29.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 266/ 291] layers.29.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 267/ 291] layers.29.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 268/ 291] layers.29.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 269/ 291] layers.29.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", - "[ 270/ 291] layers.29.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 271/ 291] layers.29.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.227 0.107 0.088 0.067 0.047 0.031 0.020 0.027 \n", - "[ 272/ 291] layers.29.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 273/ 291] layers.29.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", - "[ 274/ 291] layers.30.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.047 0.032 0.020 0.027 \n", - "[ 275/ 291] layers.30.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 276/ 291] layers.30.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.047 0.032 0.020 0.027 \n", - "[ 277/ 291] layers.30.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 278/ 291] layers.30.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", - "[ 279/ 291] layers.30.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 280/ 291] layers.30.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. 
size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.019 0.030 0.046 0.066 0.088 0.108 0.232 0.108 0.088 0.066 0.046 0.031 0.019 0.027 \n", - "[ 281/ 291] layers.30.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 282/ 291] layers.30.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", - "[ 283/ 291] layers.31.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.228 0.106 0.088 0.067 0.047 0.031 0.020 0.027 \n", - "[ 284/ 291] layers.31.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.047 0.031 0.020 0.027 \n", - "[ 285/ 291] layers.31.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.228 0.106 0.088 0.067 0.047 0.031 0.020 0.027 \n", - "[ 286/ 291] layers.31.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 287/ 291] layers.31.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", - "[ 288/ 291] layers.31.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", - "[ 289/ 291] layers.31.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.019 0.030 0.045 0.065 0.088 0.109 0.237 0.109 0.088 0.065 0.045 0.030 0.019 0.027 \n", - "[ 290/ 291] layers.31.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. 
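The sha256sum cell above pins the exact artifact this run produced. A pure-Python equivalent, useful when the shell utility is unavailable, is sketched below; the path and expected digest are copied from that cell, and the rest is standard-library hashlib.

```python
import hashlib

# Path and digest exactly as printed by the sha256sum cell above.
MODEL_PATH = "./llama.cpp/zh-models/7B/ggml-model-q8_0.bin"
EXPECTED = "0eec8927427f159397c79961a28d62d78849514a4a19033b247edd6ac3fc2cfd"

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file through SHA-256 in 1 MiB chunks so a ~7GB model never sits in RAM."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

assert sha256_of(MODEL_PATH) == EXPECTED, "q8_0 model file does not match the published digest"
```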
- { - "cell_type": "markdown", - "metadata": { - "id": "DLkuRAo9Vkb1" - }, - "source": [ - "### (Optional) Test decoding with the quantized model\n", - "At this point all conversion steps are complete.\n", - "We run a single command to check that the model loads correctly and can hold a conversation.\n", - "\n", - "The FP16 and Q8-quantized files are stored under ./llama.cpp/zh-models/7B; download and use them as needed." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "tW-ep1BsVQtG", - "outputId": "b3b28e5e-c731-4bb5-d3ae-c09d4c7bfb81" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "main: seed = 1682671021\n", - "llama.cpp: loading model from ./zh-models/7B/ggml-model-q8_0.bin\n", - "llama_model_load_internal: format = ggjt v1 (latest)\n", - "llama_model_load_internal: n_vocab = 49954\n", - "llama_model_load_internal: n_ctx = 512\n", - "llama_model_load_internal: n_embd = 4096\n", - "llama_model_load_internal: n_mult = 256\n", - "llama_model_load_internal: n_head = 32\n", - "llama_model_load_internal: n_layer = 32\n", - "llama_model_load_internal: n_rot = 128\n", - "llama_model_load_internal: ftype = 7 (mostly Q8_0)\n", - "llama_model_load_internal: n_ff = 11008\n", - "llama_model_load_internal: n_parts = 1\n", - "llama_model_load_internal: model size = 7B\n", - "llama_model_load_internal: ggml ctx size = 59.11 KB\n", - "llama_model_load_internal: mem required = 9180.12 MB (+ 1026.00 MB per state)\n", - "llama_init_from_file: kv self size = 256.00 MB\n", - "\n", - "system_info: n_threads = 4 / 4 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | \n", - "sampling: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.100000\n", - "generate: n_ctx = 512, n_batch = 512, n_predict = 512, n_keep = 0\n", - "\n", - "\n", - "\u001b[33m 详细介绍一下北京的名胜古迹:\u001b[0m长城、故宫等。同时介绍一些小众景点,比如颐和园中的石舫、圆明园中的琉璃花门等等。 [end of text]\n", - "\n", - "llama_print_timings: load time = 19881.66 ms\n", - "llama_print_timings: sample time = 48.31 ms / 32 runs ( 1.51 ms per run)\n", - "llama_print_timings: prompt eval time = 11365.17 ms / 11 tokens ( 1033.20 ms per token)\n", - "llama_print_timings: eval time = 33910.03 ms / 31 runs ( 1093.87 ms per run)\n", - "llama_print_timings: total time = 53841.09 ms\n" - ] - } - ], - "source": [ - "!cd llama.cpp && ./main -m ./zh-models/7B/ggml-model-q8_0.bin --color -f ./prompts/alpaca.txt -p \"详细介绍一下北京的名胜古迹:\" -n 512" - ] - } - ], - "metadata": { - "accelerator": "TPU", - "colab": { - "machine_shape": "hm", - "provenance": [] - }, - "gpuClass": "premium", - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file
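The per-tensor sizes in the quantize log above can be reproduced by hand. A minimal sanity check, assuming the Q8_0 block layout ggml used at the time (a 4-byte f32 scale followed by 32 int8 values, i.e. 36 bytes per 32 weights; the layout is an assumption about this specific llama.cpp revision, as later versions switched to an f16 scale):

```python
# Back-of-the-envelope check of the quantize log above.
# ASSUMPTION: Q8_0 block = 4-byte f32 scale + 32 int8 weights = 36 bytes per 32 weights.
BYTES_PER_BLOCK = 36
WEIGHTS_PER_BLOCK = 32

def q8_0_mb(rows: int, cols: int) -> float:
    """Size in MB (1 MB = 2**20 bytes) of a rows x cols tensor after Q8_0 quantization."""
    n_weights = rows * cols
    return n_weights // WEIGHTS_PER_BLOCK * BYTES_PER_BLOCK / 2**20

print(q8_0_mb(4096, 4096))   # 18.0   -> matches "32.00 MB -> 18.00 MB" for attention tensors
print(q8_0_mb(4096, 11008))  # 48.375 -> matches "86.00 MB -> 48.38 MB" for feed-forward tensors
```

The same arithmetic explains the overall ratio reported by llama_model_quantize_internal: 36/32 bytes per weight instead of 2 bytes is a factor of 0.5625, which takes the 13133.55 MB f16 model to roughly the 7388.06 MB shown.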
"!cd llama.cpp && ./main -m ./zh-models/7B/ggml-model-q8_0.bin --color -f ./prompts/alpaca.txt -p \"详细介绍一下北京的名胜古迹:\" -n 512" - ] - } - ], - "metadata": { - "accelerator": "TPU", - "colab": { - "machine_shape": "hm", - "provenance": [] - }, - "gpuClass": "premium", - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/notebooks/convert_and_quantize_chinese_llama.ipynb b/notebooks/convert_and_quantize_chinese_llama.ipynb deleted file mode 100644 index ce077f3..0000000 --- a/notebooks/convert_and_quantize_chinese_llama.ipynb +++ /dev/null @@ -1,1874 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "machine_shape": "hm" - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - }, - "gpuClass": "standard", - "accelerator": "TPU" - }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# 转换并量化中文LLaMA/Alpaca模型\n", - "\n", - "🎉🎉🎉 **新:现在免费用户也有机会能够转换7B和13B模型了!**\n", - "\n", - "💡 提示和小窍门:\n", - "- 免费用户默认的内存只有12G左右,**笔者用免费账号实测选择TPU的话有机会随机出35G内存**,建议多试几次。如果能随机出25G内存以上的机器就可以了转换7B模型了,35G内存以上机器就能转换13B模型了\n", - "- Pro(+)用户请选择 “代码执行程序” -> “更改运行时类型” -> “高RAM”\n", - "- 实测:转换7B级别模型,25G内存的机器就够了;转换13B级别模型需要30G以上的内存(程序莫名崩掉或断开连接就说明内存爆了)\n", - "- 如果选了“高RAM”之后内存还是不够大的话,选择以下操作,有的时候会分配出很高内存的机器,祝你好运😄!\n", - " - 可以把GPU或者TPU也选上(虽然不会用到)\n", - " - 选GPU时,Pro用户可选“高级”类型GPU\n", - "\n", - "以下信息配置信息供参考(Pro订阅下测试),运行时规格设置为“高RAM”时的设备配置如下(有随机性):\n", - "\n", - "| 硬件加速器 | RAM | 硬盘 |\n", - "| :-- | :--: | :--: |\n", - "| None | 25GB | 225GB |\n", - "| TPU | 35GB | 225GB |\n", - "| GPU(标准,T4)| 25GB | 166GB |\n", - "| GPU(高性能,V100)| 25GB | 166GB |\n", - "| GPU(高性能,A100)| **80GB** | 166GB |\n", - "\n", - "*温馨提示:用完之后注意断开运行时,选择满足要求的最低配置即可,避免不必要的计算单元消耗(Pro只给100个计算单元)。*" - ], - "metadata": { - "id": "B1c96_k3MahN" - } - }, - { - "cell_type": "markdown", - "source": [ - "## 安装相关依赖" - ], - "metadata": { - "id": "vScqHD_jMFOV" - } - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "E5WKFJXIL6ZU", - "outputId": "7ce317e5-c105-49a8-d1af-70c29e6246e1" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Collecting transformers\n", - " Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.0/7.0 MB\u001b[0m \u001b[31m54.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from transformers) (2.27.1)\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.9/dist-packages (from transformers) (1.24.2)\n", - "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.9/dist-packages (from transformers) (6.0)\n", - "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.9/dist-packages (from transformers) (4.65.0)\n", - "Collecting huggingface-hub<1.0,>=0.11.0\n", - " Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m200.1/200.1 kB\u001b[0m \u001b[31m24.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already 
- { - "cell_type": "markdown", - "source": [ - "## Install dependencies" - ], - "metadata": { - "id": "vScqHD_jMFOV" - } - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "E5WKFJXIL6ZU", - "outputId": "7ce317e5-c105-49a8-d1af-70c29e6246e1" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Collecting transformers\n", - "[... pip dependency-resolution lines omitted ...]\n", - "Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.0\n", - "Collecting peft\n", - "[... pip dependency-resolution lines omitted ...]\n", - "Successfully installed accelerate-0.18.0 peft-0.2.0\n", - "Collecting sentencepiece\n", - "[... pip dependency-resolution lines omitted ...]\n", - "Successfully installed sentencepiece-0.1.98\n" - ] - } - ], - "source": [ - "!pip install transformers\n", - "!pip install peft\n", - "!pip install sentencepiece" - ] - },
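The cell above installs whatever is current, and the Colab base image drifts over time; to reproduce this exact run, pinning to the versions reported in the install log is safer. A suggested pin (versions copied from that log; the pinned command itself is not part of the original notebook):

```python
# Versions copied from the install log above; pinning guards against Colab image drift.
!pip install transformers==4.28.0 peft==0.2.0 accelerate==0.18.0 sentencepiece==0.1.98
```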
- { - "cell_type": "markdown", - "source": [ - "## Clone the repositories and code" - ], - "metadata": { - "id": "ygb1xFIMNQKw" - } - }, - { - "cell_type": "code", - "source": [ - "!git clone https://github.com/ymcui/Chinese-LLaMA-Alpaca\n", - "!git clone https://github.com/ggerganov/llama.cpp" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "yCEJh7NJNXz9", - "outputId": "91a0e4ff-af63-4f8e-ab82-ee4ddf583033" - }, - "execution_count": 2, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Cloning into 'Chinese-LLaMA-Alpaca'...\n", - "remote: Enumerating objects: 559, done.\u001b[K\n", - "remote: Counting objects: 100% (129/129), done.\u001b[K\n", - "remote: Compressing objects: 100% (115/115), done.\u001b[K\n", - "remote: Total 559 (delta 30), reused 22 (delta 14), pack-reused 430\u001b[K\n", - "Receiving objects: 100% (559/559), 10.71 MiB | 25.49 MiB/s, done.\n", - "Resolving deltas: 100% (333/333), done.\n", - "Cloning into 'llama.cpp'...\n", - "remote: Enumerating objects: 1701, done.\u001b[K\n", - "remote: Counting objects: 100% (1701/1701), done.\u001b[K\n", - "remote: Compressing objects: 100% (620/620), done.\u001b[K\n", - "remote: Total 1701 (delta 1084), reused 1623 (delta 1047), pack-reused 0\u001b[K\n", - "Receiving objects: 100% (1701/1701), 1.86 MiB | 14.74 MiB/s, done.\n", - "Resolving deltas: 100% (1084/1084), done.\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Merge the models (Alpaca-7B as an example)\n", - "\n", - "**⚠️ Another reminder: the 7B model needs 25GB of RAM; the 13B model needs 35GB+.**\n", - "\n", - "We use the base model hosted on the 🤗 Hub (already in HF format) rather than Facebook's official LLaMA release, so the step of converting the original LLaMA to HF format is skipped.\n", - "\n", - "**We run step 2 directly here: merging the LoRA weights** to produce the full model weights. You can point at a 🤗 Hub repo ID or at a local path.\n", - "- Base model: `decapoda-research/llama-7b-hf` *(use at your own risk)*\n", - "- LoRA model: `ziqingyang/chinese-alpaca-lora-7b`\n", - "\n", - "💡 Tips for converting the 13B model (a ready-made variant of the command is sketched right after this cell):\n", - "- Simply change `7b` to `13b` in the `--base_model` and `--lora_model` arguments\n", - "- **Free-tier users must add the `--offload_dir` argument to relieve memory pressure**, e.g. `--offload_dir ./offload_temp`\n", - "\n", - "This step is fairly time-consuming (download + conversion), taking from a few minutes to over ten; please be patient.\n", - "The converted model is stored in the `alpaca-combined` directory.\n", - "If you do not need a quantized model, you are done at this point." - ], - "metadata": { - "id": "nIyxX0DSNsgQ" - } - },
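Applying those 13B tips to the 7B command in the next cell yields the variant below. It is assembled purely from the instructions above (`7b` -> `13b` in both model IDs, plus `--offload_dir` for low-RAM runtimes) and was not run in this notebook:

```python
# 13B variant of the merge command, per the tips above; not executed in this notebook.
!python ./Chinese-LLaMA-Alpaca/scripts/merge_llama_with_chinese_lora.py \
    --base_model 'decapoda-research/llama-13b-hf' \
    --lora_model 'ziqingyang/chinese-alpaca-lora-13b' \
    --offload_dir ./offload_temp \
    --output_dir alpaca-combined
```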
- { - "cell_type": "code", - "source": [ - "!python ./Chinese-LLaMA-Alpaca/scripts/merge_llama_with_chinese_lora.py \\\n", - " --base_model 'decapoda-research/llama-7b-hf' \\\n", - " --lora_model 'ziqingyang/chinese-alpaca-lora-7b' \\\n", - " --output_dir alpaca-combined" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5AV4EW5hNhVV", - "outputId": "e34419d4-b7c9-4e22-af37-abf80d4163ba" - }, - "execution_count": 3, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "2023-04-14 10:13:45.382526: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "Downloading tokenizer.model: 100% 758k/758k [00:00<00:00, 12.7MB/s]\n", - "Downloading (…)cial_tokens_map.json: 100% 96.0/96.0 [00:00<00:00, 15.3kB/s]\n", - "Downloading (…)okenizer_config.json: 100% 166/166 [00:00<00:00, 63.2kB/s]\n", - "Downloading (…)lve/main/config.json: 100% 427/427 [00:00<00:00, 63.4kB/s]\n", - "Downloading (…)model.bin.index.json: 100% 25.5k/25.5k [00:00<00:00, 9.41MB/s]\n", - "Downloading shards: 0% 0/33 [00:00 121.96 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[2/291] norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[3/291] output.weight - [4096 x 49954], type = f16, quantizing .. size = 390.27 MB -> 121.96 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.126 0.134 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", - "[4/291] layers.0.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.021 0.016 0.028 0.046 0.071 0.103 0.137 0.158 0.137 0.103 0.071 0.046 0.028 0.016 0.021 \n", - "[5/291] layers.0.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.021 0.016 0.027 0.045 0.071 0.104 0.138 0.158 0.139 0.104 0.071 0.045 0.027 0.016 0.021 \n", - "[6/291] layers.0.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.018 0.032 0.051 0.076 0.103 0.128 0.141 0.128 0.103 0.075 0.051 0.032 0.019 0.022 \n", - "[7/291] layers.0.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.021 0.016 0.028 0.046 0.072 0.105 0.136 0.151 0.136 0.105 0.072 0.046 0.028 0.016 0.021 \n", - "[8/291] layers.0.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[... per-tensor quantization lines for tensors 9-186 (layers 0-20) omitted; the pattern repeats throughout: f16 attention tensors quantize 32.00 MB -> 10.00 MB, f16 feed-forward tensors 86.00 MB -> 26.88 MB, and the f32 norm weights stay at 0.016 MB ...]\n", - "[187/291] layers.20.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.124 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[188/291] layers.20.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[189/291] layers.20.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.124 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[190/291] layers.20.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[191/291] layers.20.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[192/291] layers.20.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[193/291] layers.21.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[194/291] layers.21.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[195/291] layers.21.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[196/291] layers.21.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.124 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[197/291] layers.21.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[198/291] layers.21.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.124 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[199/291] layers.21.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[200/291] layers.21.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[201/291] layers.21.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[202/291] layers.22.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[203/291] layers.22.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[204/291] layers.22.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.077 0.053 0.033 0.019 0.022 \n", - "[205/291] layers.22.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.124 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[206/291] layers.22.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[207/291] layers.22.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[208/291] layers.22.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[209/291] layers.22.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[210/291] layers.22.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[211/291] layers.23.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[212/291] layers.23.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.134 0.125 0.104 0.077 0.053 0.033 0.019 0.022 \n", - "[213/291] layers.23.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.077 0.053 0.033 0.019 0.022 \n", - "[214/291] layers.23.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[215/291] layers.23.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[216/291] layers.23.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.124 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[217/291] layers.23.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[218/291] layers.23.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[219/291] layers.23.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[220/291] layers.24.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[221/291] layers.24.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[222/291] layers.24.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[223/291] layers.24.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.124 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[224/291] layers.24.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[225/291] layers.24.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[226/291] layers.24.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[227/291] layers.24.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[228/291] layers.24.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[229/291] layers.25.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[230/291] layers.25.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[231/291] layers.25.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[232/291] layers.25.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[233/291] layers.25.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[234/291] layers.25.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[235/291] layers.25.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[236/291] layers.25.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[237/291] layers.25.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[238/291] layers.26.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[239/291] layers.26.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[240/291] layers.26.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[241/291] layers.26.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[242/291] layers.26.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[243/291] layers.26.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[244/291] layers.26.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[245/291] layers.26.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[246/291] layers.26.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[247/291] layers.27.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[248/291] layers.27.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[249/291] layers.27.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[250/291] layers.27.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[251/291] layers.27.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[252/291] layers.27.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[253/291] layers.27.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[254/291] layers.27.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[255/291] layers.27.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[256/291] layers.28.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[257/291] layers.28.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[258/291] layers.28.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.134 0.125 0.104 0.077 0.053 0.033 0.019 0.022 \n", - "[259/291] layers.28.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[260/291] layers.28.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[261/291] layers.28.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[262/291] layers.28.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[263/291] layers.28.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[264/291] layers.28.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[265/291] layers.29.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[266/291] layers.29.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[267/291] layers.29.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[268/291] layers.29.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[269/291] layers.29.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[270/291] layers.29.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[271/291] layers.29.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.032 0.052 0.077 0.104 0.126 0.134 0.126 0.104 0.077 0.052 0.032 0.019 0.022 \n", - "[272/291] layers.29.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[273/291] layers.29.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[274/291] layers.30.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.126 0.134 0.125 0.104 0.077 0.052 0.033 0.019 0.022 \n", - "[275/291] layers.30.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[276/291] layers.30.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.134 0.125 0.104 0.077 0.053 0.033 0.019 0.022 \n", - "[277/291] layers.30.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[278/291] layers.30.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[279/291] layers.30.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[280/291] layers.30.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.018 0.032 0.051 0.076 0.104 0.128 0.137 0.128 0.104 0.076 0.051 0.032 0.018 0.022 \n", - "[281/291] layers.30.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[282/291] layers.30.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[283/291] layers.31.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.032 0.052 0.077 0.104 0.126 0.135 0.126 0.104 0.077 0.052 0.032 0.019 0.022 \n", - "[284/291] layers.31.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.126 0.134 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", - "[285/291] layers.31.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.032 0.052 0.077 0.104 0.126 0.135 0.126 0.104 0.077 0.052 0.032 0.019 0.022 \n", - "[286/291] layers.31.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[287/291] layers.31.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", - "[288/291] layers.31.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", - "[289/291] layers.31.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.021 0.018 0.031 0.050 0.075 0.104 0.130 0.140 0.130 0.104 0.075 0.050 0.031 0.018 0.021 \n", - "[290/291] layers.31.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. 
- { - "cell_type": "markdown", - "source": [ - "### (Optional) Test decoding with the quantized model\n", - "All conversion steps are now complete.\n", - "We run a single command to check that the model loads correctly and can hold a conversation.\n", - "\n", - "The FP16 and Q4 quantized files are stored under ./llama.cpp/zh-models/7B and can be downloaded as needed." - ], - "metadata": { - "id": "DLkuRAo9Vkb1" - } - }, - { - "cell_type": "code", - "source": [ - "!cd llama.cpp && ./main -m ./zh-models/7B/ggml-model-q4_0.bin --color -f ./prompts/alpaca.txt -p \"详细介绍一下北京的名胜古迹:\" -n 512" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "tW-ep1BsVQtG", - "outputId": "0706c974-127e-4f21-be6b-d71ea4fb989b" - }, - "execution_count": 10, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "main: seed = 1681467955\n", - "llama.cpp: loading model from ./zh-models/7B/ggml-model-q4_0.bin\n", - "llama_model_load_internal: format = ggjt v1 (latest)\n", - "llama_model_load_internal: n_vocab = 49954\n", - "llama_model_load_internal: n_ctx = 512\n", - "llama_model_load_internal: n_embd = 4096\n", - "llama_model_load_internal: n_mult = 256\n", - "llama_model_load_internal: n_head = 32\n", - "llama_model_load_internal: n_layer = 32\n", - "llama_model_load_internal: n_rot = 128\n", - "llama_model_load_internal: ftype = 2 (mostly Q4_0)\n", - "llama_model_load_internal: n_ff = 11008\n", - "llama_model_load_internal: n_parts = 1\n", - "llama_model_load_internal: model size = 7B\n", - "llama_model_load_internal: ggml ctx size = 59.11 KB\n", - "llama_model_load_internal: mem required = 5896.99 MB (+ 1026.00 MB per state)\n", - "llama_init_from_file: kv self size = 256.00 MB\n", - "\n", - "system_info: n_threads = 40 / 40 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | \n", - "sampling: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.100000\n", - "generate: n_ctx = 512, n_batch = 8, n_predict = 512, n_keep = 0\n", - "\n", - "\n", - "\u001b[33m 详细介绍一下北京的名胜古迹:\u001b[0m\n", - " 故宫:明、清两代皇室,御花园及八达门大街。 宫殿内有大量文物珍品; [end of text]\n", - "\n", - "llama_print_timings: load time = 717.01 ms\n", - "llama_print_timings: sample time = 48.97 ms / 32 runs ( 1.53 ms per run)\n", - "llama_print_timings: prompt eval time = 680.93 ms / 11 tokens ( 61.90 ms per token)\n", - "llama_print_timings: eval time = 4490.00 ms / 31 runs ( 144.84 ms per run)\n", - "llama_print_timings: total time = 5461.05 ms\n" - ] - } - ] - } - ] -} \ No newline at end of file
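The test run above samples with temp = 0.8, top_k = 40, top_p = 0.95 and repeat_penalty = 1.1 over the last 64 tokens. As a reading aid for those knobs, here is a toy NumPy sketch of such a sampling pipeline (illustrative only; llama.cpp's actual sampler differs in ordering and implementation details):

```python
import numpy as np

def sample_token(logits, recent_ids, temp=0.8, top_k=40, top_p=0.95,
                 repeat_penalty=1.1, rng=np.random.default_rng()):
    logits = logits.astype(np.float64)
    for t in set(recent_ids):                    # penalize recently used tokens
        logits[t] = logits[t] / repeat_penalty if logits[t] > 0 else logits[t] * repeat_penalty
    logits /= temp                               # temperature scaling
    top = np.argsort(logits)[-top_k:]            # keep the top_k candidates
    p = np.exp(logits[top] - logits[top].max())  # softmax over the candidates
    p /= p.sum()
    desc = np.argsort(p)[::-1]                   # nucleus (top_p) cut
    n_keep = max(1, int(np.searchsorted(np.cumsum(p[desc]), top_p) + 1))
    keep = desc[:n_keep]
    p = p[keep] / p[keep].sum()
    return int(top[keep[rng.choice(len(keep), p=p)]])
```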
From 11630c43483ad41a621fa29ab40ef2af9cfc8546 Mon Sep 17 00:00:00 2001 From: ymcui Date: Thu, 15 Jun 2023 17:24:53 +0800 Subject: [PATCH 3/9] update legacy notebook --- .gitignore | 2 - ...nd_quantize_chinese_llama_and_alpaca.ipynb | 2568 +++++++++++++++++ ...ert_and_quantize_chinese_alpaca_plus.ipynb | 1171 ++++++++ .../convert_and_quantize_chinese_llama.ipynb | 1874 ++++++++++++ 4 files changed, 5613 insertions(+), 2 deletions(-) create mode 100644 notebooks/convert_and_quantize_chinese_llama_and_alpaca.ipynb create mode 100644 notebooks/legacy/convert_and_quantize_chinese_alpaca_plus.ipynb create mode 100644 notebooks/legacy/convert_and_quantize_chinese_llama.ipynb diff --git a/.gitignore b/.gitignore index 4d3a240..f0fed7e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,2 @@ .DS_Store */.DS_Store -*.ipynb -*/*.ipynb diff --git a/notebooks/convert_and_quantize_chinese_llama_and_alpaca.ipynb b/notebooks/convert_and_quantize_chinese_llama_and_alpaca.ipynb new file mode 100644 index 0000000..e15bab3 --- /dev/null +++ b/notebooks/convert_and_quantize_chinese_llama_and_alpaca.ipynb @@ -0,0 +1,2568 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "machine_shape": "hm", + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Convert and quantize the Chinese LLaMA and Alpaca models\n", + "\n", + "Project repo: https://github.com/ymcui/Chinese-LLaMA-Alpaca\n", + "\n", + "⚠️ Memory usage notes (make sure the machine you are allocated has more RAM than the following):\n", + "- 7B model: 15 GB+\n", + "- 13B model: 18 GB+\n", + "- 33B model: 22 GB+\n", + "\n", + "💡 Tips and tricks:\n", + "- Free-tier users get only about 12 GB of RAM by default, which is not enough to convert the models. **In our tests, selecting a TPU runtime can randomly yield a 35 GB machine**, so it is worth retrying a few times\n", + "- Pro(+) users should select \"Runtime\" -> \"Change runtime type\" -> \"High-RAM\"\n", + "- If the program crashes or the session disconnects for no apparent reason, you have run out of memory\n", + "- If \"High-RAM\" is still not enough, the steps below sometimes get you a machine with much more memory. Good luck 😄!\n", + " - Select a GPU or TPU as well (even though it will not be used)\n", + " - When selecting a GPU, Pro(+) users can choose the \"A100\" type\n", + "\n", + "*Friendly reminder: disconnect the runtime when you are done and pick the lowest configuration that meets the requirements, to avoid wasting compute units (Pro only grants 100 units).*" + ], + "metadata": { + "id": "B1c96_k3MahN" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Install dependencies" + ], + "metadata": { + "id": "vScqHD_jMFOV" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "E5WKFJXIL6ZU", + "outputId": "a7baeebb-9b74-4d14-93dc-fb1f6e1b3716" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Requirement already satisfied: torch==1.13.1 in /usr/local/lib/python3.10/dist-packages (1.13.1)\n", + "Requirement already satisfied: transformers==4.30.2 in /usr/local/lib/python3.10/dist-packages (4.30.2)\n", + "[... pip dependency-resolution log trimmed: the dependencies of torch 1.13.1 and transformers 4.30.2 were already satisfied; peft 0.3.0 and accelerate 0.20.3 were downloaded and installed ...]\n", + "Installing collected packages: accelerate, peft\n", + "Successfully installed accelerate-0.20.3 peft-0.3.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (0.1.99)\n" + ] + } + ], + "source": [ + "!pip install torch==1.13.1\n", + "!pip install transformers==4.30.2\n", + "!pip install peft==0.3.0\n", + "!pip install sentencepiece" + ] + },
{ + "cell_type": "markdown", + "source": [ + "## Clone the repositories and code" + ], + "metadata": { + "id": "ygb1xFIMNQKw" + } + }, + { + "cell_type": "code", + "source": [ + "!git clone https://github.com/ymcui/Chinese-LLaMA-Alpaca\n", + "!git clone https://github.com/ggerganov/llama.cpp" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yCEJh7NJNXz9", + "outputId": "bfa34a83-a8b9-4e24-e956-83c7313eb448" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Cloning into 'Chinese-LLaMA-Alpaca'...\n", + "remote: Enumerating objects: 1407, done.\u001b[K\n", + "remote: Counting objects: 100% (599/599), done.\u001b[K\n", + "remote: Compressing objects: 100% (257/257), done.\u001b[K\n", + "remote: Total 1407 (delta 369), reused 494 (delta 338), pack-reused 808\u001b[K\n", + "Receiving objects: 100% (1407/1407), 22.61 MiB | 27.14 MiB/s, done.\n", + "Resolving deltas: 100% (831/831), done.\n", + "Cloning into 'llama.cpp'...\n", + "remote: Enumerating objects: 3618, done.\u001b[K\n", + "remote: Counting objects: 100% (1155/1155), done.\u001b[K\n", + "remote: Compressing objects: 100% (124/124), done.\u001b[K\n", + "remote: Total 3618 (delta 1076), reused 1036 (delta 1031), pack-reused 2463\u001b[K\n", + "Receiving objects: 100% (3618/3618), 3.28 MiB | 21.36 MiB/s, done.\n", + "Resolving deltas: 100% (2424/2424), done.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Merge the models (using Alpaca-7B as an example)\n", + "\n", + "Here we use the base model hosted on the 🤗 Hub (already in HF format) rather than Facebook's official LLaMA release, so the step of converting the original LLaMA weights to HF format is skipped.\n", + "**We run step two directly: merging the LoRA weights** to produce the full model weights. You can pass either a 🤗 Hub repo ID or a local directory.\n", + "- Base model: `elinas/llama-7b-hf-transformers-4.29` *(use at your own risk; we verified that its SHA256 matches the official release, but you should make sure you are entitled to use this model)*\n", + "- LoRA model: `ziqingyang/chinese-alpaca-lora-7b`\n", + " - For an Alpaca-Plus model, remember to pass both the llama and the alpaca LoRAs; tutorial: [here](https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/手动模型合并与转换#多lora权重合并适用于chinese-alpaca-plus)\n", + "- Output format: pth or huggingface; we choose pth here because llama.cpp quantization comes next\n", + "\n", + "Since the models have to be downloaded, please be patient, especially for the 33B model.\n", + "The merged model is stored in the `alpaca-combined` directory.\n", + "If you do not need a quantized model, you are done at this point and can download the result or copy it to Google Drive." + ], + "metadata": { + "id": "nIyxX0DSNsgQ" + } + },
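For readers wondering what "merging the LoRA weights" computes: for every weight matrix the adapter targets, the low-rank update is folded back into the dense matrix. A minimal PyTorch sketch of that core update (function and variable names are illustrative, not the merge script's actual code):

```python
import torch

def fold_lora(W: torch.Tensor, lora_A: torch.Tensor, lora_B: torch.Tensor,
              lora_alpha: float, r: int) -> torch.Tensor:
    """Return the merged dense weight W' = W + (lora_alpha / r) * B @ A.

    Shapes: W [out, in]; lora_A [r, in]; lora_B [out, r].
    """
    scaling = lora_alpha / r
    return W + scaling * (lora_B @ lora_A)

# Toy shapes only; the real tensors come from the base and LoRA checkpoints.
merged = fold_lora(torch.zeros(8, 16), torch.randn(2, 16), torch.randn(8, 2),
                   lora_alpha=4.0, r=2)
```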
+ { + "cell_type": "code", + "source": [ + "!python ./Chinese-LLaMA-Alpaca/scripts/merge_llama_with_chinese_lora_low_mem.py \\\n", + " --base_model 'elinas/llama-7b-hf-transformers-4.29' \\\n", + " --lora_model 'ziqingyang/chinese-alpaca-lora-7b' \\\n", + " --output_type pth \\\n", + " --output_dir alpaca-combined" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5AV4EW5hNhVV", + "outputId": "5cb36099-4ca1-403e-c6b5-c8c8441eaa11" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Base model: elinas/llama-7b-hf-transformers-4.29\n", + "LoRA model(s) ['ziqingyang/chinese-alpaca-lora-7b']:\n", + "Loading ziqingyang/chinese-alpaca-lora-7b\n", + "Cannot find lora model on the disk. Downloading lora model from hub...\n", + "Fetching 7 files: 0% 0/7 [00:00 109.76 MB | hist: \n", + "[ 2/ 291] norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 3/ 291] output.weight - 4096 x 49954, type = f16, quantizing .. size = 390.27 MB -> 160.07 MB | hist: \n", + "[ 4/ 291] layers.0.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 5/ 291] layers.0.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 6/ 291] layers.0.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 13.12 MB | hist: \n", + "[ 7/ 291] layers.0.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 8/ 291] layers.0.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 9/ 291] layers.0.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 10/ 291] layers.0.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 35.27 MB | hist: \n", + "[ 11/ 291] layers.0.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 12/ 291] layers.0.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[... per-layer quantization log for layers 1-19 trimmed: attention tensors drop from 32.00 MB to 9.00 MB (13.12 MB for some wv tensors), feed_forward tensors from 86.00 MB to 24.19 MB (35.27 MB for some w2 tensors), and norm weights stay f32 at 0.016 MB ...]\n", + "[ 184/ 291] layers.20.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 185/ 291] layers.20.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 186/ 291] layers.20.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 187/ 291] layers.20.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 188/ 291] layers.20.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 189/ 291] layers.20.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 190/ 291] layers.20.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 191/ 291] layers.20.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 192/ 291] layers.20.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 193/ 291] layers.21.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 194/ 291] layers.21.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 195/ 291] layers.21.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 13.12 MB | hist: \n", + "[ 196/ 291] layers.21.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 197/ 291] layers.21.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 198/ 291] layers.21.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 199/ 291] layers.21.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 35.27 MB | hist: \n", + "[ 200/ 291] layers.21.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 201/ 291] layers.21.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 202/ 291] layers.22.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 203/ 291] layers.22.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 204/ 291] layers.22.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 205/ 291] layers.22.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 206/ 291] layers.22.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 207/ 291] layers.22.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 208/ 291] layers.22.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 209/ 291] layers.22.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 210/ 291] layers.22.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 211/ 291] layers.23.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 212/ 291] layers.23.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 213/ 291] layers.23.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 214/ 291] layers.23.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 215/ 291] layers.23.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 216/ 291] layers.23.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 217/ 291] layers.23.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 218/ 291] layers.23.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 219/ 291] layers.23.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 220/ 291] layers.24.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 221/ 291] layers.24.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 222/ 291] layers.24.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 13.12 MB | hist: \n", + "[ 223/ 291] layers.24.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 224/ 291] layers.24.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 225/ 291] layers.24.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 226/ 291] layers.24.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 35.27 MB | hist: \n", + "[ 227/ 291] layers.24.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 228/ 291] layers.24.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 229/ 291] layers.25.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 230/ 291] layers.25.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 231/ 291] layers.25.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 232/ 291] layers.25.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 233/ 291] layers.25.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 234/ 291] layers.25.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 235/ 291] layers.25.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 236/ 291] layers.25.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 237/ 291] layers.25.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 238/ 291] layers.26.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 239/ 291] layers.26.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 240/ 291] layers.26.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 241/ 291] layers.26.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 242/ 291] layers.26.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 243/ 291] layers.26.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. 
size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 244/ 291] layers.26.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 245/ 291] layers.26.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 246/ 291] layers.26.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 247/ 291] layers.27.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 248/ 291] layers.27.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 249/ 291] layers.27.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 13.12 MB | hist: \n", + "[ 250/ 291] layers.27.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 251/ 291] layers.27.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 252/ 291] layers.27.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 253/ 291] layers.27.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 35.27 MB | hist: \n", + "[ 254/ 291] layers.27.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 255/ 291] layers.27.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 256/ 291] layers.28.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 257/ 291] layers.28.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 258/ 291] layers.28.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 13.12 MB | hist: \n", + "[ 259/ 291] layers.28.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 260/ 291] layers.28.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 261/ 291] layers.28.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 262/ 291] layers.28.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 35.27 MB | hist: \n", + "[ 263/ 291] layers.28.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 264/ 291] layers.28.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 265/ 291] layers.29.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 266/ 291] layers.29.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 267/ 291] layers.29.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 13.12 MB | hist: \n", + "[ 268/ 291] layers.29.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 269/ 291] layers.29.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 270/ 291] layers.29.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 271/ 291] layers.29.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 35.27 MB | hist: \n", + "[ 272/ 291] layers.29.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. 
size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 273/ 291] layers.29.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 274/ 291] layers.30.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 275/ 291] layers.30.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 276/ 291] layers.30.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 13.12 MB | hist: \n", + "[ 277/ 291] layers.30.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 278/ 291] layers.30.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 279/ 291] layers.30.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 280/ 291] layers.30.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 35.27 MB | hist: \n", + "[ 281/ 291] layers.30.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 282/ 291] layers.30.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 283/ 291] layers.31.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 284/ 291] layers.31.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 285/ 291] layers.31.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 13.12 MB | hist: \n", + "[ 286/ 291] layers.31.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 287/ 291] layers.31.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 288/ 291] layers.31.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 289/ 291] layers.31.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 35.27 MB | hist: \n", + "[ 290/ 291] layers.31.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. 
size = 86.00 MB -> 24.19 MB | hist: \n",
+ "[ 291/ 291] layers.31.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n",
+ "llama_model_quantize_internal: model size = 13133.55 MB\n",
+ "llama_model_quantize_internal: quant size = 3988.22 MB\n",
+ "\n",
+ "main: quantize time = 153421.48 ms\n",
+ "main: total time = 153421.48 ms\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### (Optional) Test decoding with the quantized model\n",
+ "All conversion steps are now complete.\n",
+ "Let's run one command to check that the model loads properly and can hold a conversation.\n",
+ "\n",
+ "The FP16 and Q4 quantized files are stored under ./llama.cpp/zh-models/7B and can be downloaded as needed."
+ ],
+ "metadata": {
+ "id": "DLkuRAo9Vkb1"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!cd llama.cpp && ./main -m ./zh-models/7B/ggml-model-q4_K.bin --color -p \"详细介绍一下北京的名胜古迹:\" -n 128"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "tW-ep1BsVQtG",
+ "outputId": "03f0343f-3b7c-490e-a0ab-6724d79c5dc8"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "main: build = 670 (254a7a7)\n",
+ "main: seed = 1686819449\n",
+ "llama.cpp: loading model from ./zh-models/7B/ggml-model-q4_K.bin\n",
+ "llama_model_load_internal: format = ggjt v3 (latest)\n",
+ "llama_model_load_internal: n_vocab = 49954\n",
+ "llama_model_load_internal: n_ctx = 512\n",
+ "llama_model_load_internal: n_embd = 4096\n",
+ "llama_model_load_internal: n_mult = 256\n",
+ "llama_model_load_internal: n_head = 32\n",
+ "llama_model_load_internal: n_layer = 32\n",
+ "llama_model_load_internal: n_rot = 128\n",
+ "llama_model_load_internal: ftype = 15 (mostly Q4_K - Medium)\n",
+ "llama_model_load_internal: n_ff = 11008\n",
+ "llama_model_load_internal: n_parts = 1\n",
+ "llama_model_load_internal: model size = 7B\n",
+ "llama_model_load_internal: ggml ctx size = 0.07 MB\n",
+ "llama_model_load_internal: mem required = 5780.29 MB (+ 1026.00 MB per state)\n",
+ "................................................................................................\n",
+ "llama_init_from_file: kv self size = 256.00 MB\n",
+ "\n",
+ "system_info: n_threads = 4 / 4 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | \n",
+ "sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000\n",
+ "generate: n_ctx = 512, n_batch = 512, n_predict = 128, n_keep = 0\n",
+ "\n",
+ "\n",
+ "\u001b[33m 详细介绍一下北京的名胜古迹:\u001b[0m天安门、故宫、颐和园、圆明园、北海公园等。 参观后你一定会爱上这座城市! [end of text]\n",
+ "\n",
+ "llama_print_timings: load time = 16410.24 ms\n",
+ "llama_print_timings: sample time = 30.04 ms / 30 runs ( 1.00 ms per token)\n",
+ "llama_print_timings: prompt eval time = 3479.21 ms / 11 tokens ( 316.29 ms per token)\n",
+ "llama_print_timings: eval time = 10516.40 ms / 29 runs ( 362.63 ms per token)\n",
+ "llama_print_timings: total time = 14042.46 ms\n"
+ ]
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/notebooks/legacy/convert_and_quantize_chinese_alpaca_plus.ipynb b/notebooks/legacy/convert_and_quantize_chinese_alpaca_plus.ipynb
new file mode 100644
index 0000000..b3bf1e3
--- /dev/null
+++ b/notebooks/legacy/convert_and_quantize_chinese_alpaca_plus.ipynb
@@ -0,0 +1,1171 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "B1c96_k3MahN"
+ },
+ "source": [
+ "# Convert and Quantize the Chinese Alpaca Plus Models\n",
+ "\n",
+ "For other models, please refer to the companion notebook: https://colab.research.google.com/drive/1Eak6azD3MLeb-YsfbP8UZC8wrL1ddIMI?usp=sharing\n",
+ "\n",
+ "\n",
+ "🎉🎉🎉 **New: free-tier users now also have a chance to convert the 7B and 13B models!**\n",
+ "\n",
+ "💡 Tips and tricks:\n",
+ "- Free-tier accounts get only about 12 GB of RAM by default. **In our tests, a free account that selects a TPU runtime sometimes gets a 35 GB machine at random**, so it is worth trying several times. A machine with more than 25 GB of RAM is enough to convert the 7B model; more than 35 GB is enough for the 13B model\n",
+ "- Pro(+) users should choose \"Runtime\" -> \"Change runtime type\" -> \"High-RAM\"\n",
+ "- Measured: converting a 7B-class model needs a 25 GB machine; converting a 13B-class model needs more than 30 GB of RAM (if the program crashes inexplicably or the session disconnects, you ran out of memory)\n",
+ "- If RAM is still too small after choosing \"High-RAM\", the following options sometimes get you a machine with much more memory. Good luck 😄!\n",
+ " - Also select a GPU or TPU (even though it will not be used)\n",
+ " - When selecting a GPU, Pro users can pick the \"Premium\" GPU class\n",
+ "\n",
+ "The following configurations are for reference (tested under a Pro subscription). With the runtime shape set to \"High-RAM\", the assigned hardware was as follows (with some randomness):\n",
+ "\n",
+ "| Hardware accelerator | RAM | Disk |\n",
+ "| :-- | :--: | :--: |\n",
+ "| None | 25GB | 225GB |\n",
+ "| TPU | 35GB | 225GB |\n",
+ "| GPU (standard, T4) | 25GB | 166GB |\n",
+ "| GPU (premium, V100) | 25GB | 166GB |\n",
+ "| GPU (premium, A100) | **80GB** | 166GB |\n",
+ "\n",
+ "*Friendly reminder: disconnect the runtime when you are done, and choose the lowest configuration that meets your needs to avoid wasting compute units (Pro only grants 100 compute units).*"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "vScqHD_jMFOV"
+ },
+ "source": [
+ "## Install dependencies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "E5WKFJXIL6ZU",
+ "outputId": "87a89bed-053e-4e61-e2f8-1dfcbdf87fbf"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Collecting torch==1.12.0\n",
+ " Downloading torch-1.12.0-cp310-cp310-manylinux1_x86_64.whl (776.3 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m776.3/776.3 MB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch==1.12.0) (4.5.0)\n",
+ "Installing collected packages: torch\n",
+ " Attempting uninstall: torch\n",
+ " Found existing installation: torch 2.0.0+cu118\n",
+ " Uninstalling torch-2.0.0+cu118:\n",
+ " Successfully uninstalled torch-2.0.0+cu118\n",
+ "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "torchvision 0.15.1+cu118 requires torch==2.0.0, but you have torch 1.12.0 which is incompatible.\n", + "torchtext 0.15.1 requires torch==2.0.0, but you have torch 1.12.0 which is incompatible.\n", + "torchdata 0.6.0 requires torch==2.0.0, but you have torch 1.12.0 which is incompatible.\n", + "torchaudio 2.0.1+cu118 requires torch==2.0.0, but you have torch 1.12.0 which is incompatible.\n", + "peft 0.2.0 requires torch>=1.13.0, but you have torch 1.12.0 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed torch-1.12.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.28.1)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n", + "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.13.3)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4)\n", + "Requirement already satisfied: huggingface-hub<1.0,>=0.11.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.14.1)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0)\n", + "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (2023.4.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (4.5.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.12)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.15)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2022.12.7)\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting git+https://github.com/huggingface/peft\n", + " Cloning https://github.com/huggingface/peft to /tmp/pip-req-build-tnxzt7q0\n", + " Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft /tmp/pip-req-build-tnxzt7q0\n", + " Resolved https://github.com/huggingface/peft to commit 632997d1fb776c3cf05d8c2537ac9a98a7ce9435\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (23.1)\n", + "Requirement already satisfied: accelerate in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (0.18.0)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (1.22.4)\n", + "Collecting torch>=1.13.0\n", + " Downloading torch-2.0.0-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m619.9/619.9 MB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (6.0)\n", + "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (5.9.5)\n", + "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (4.28.1)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (3.1)\n", + "Collecting nvidia-cufft-cu11==10.9.0.58\n", + " Downloading nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux1_x86_64.whl (168.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m168.4/168.4 MB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cudnn-cu11==8.5.0.96\n", + " Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl (557.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m557.1/557.1 MB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (2.0.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (3.12.0)\n", + "Collecting nvidia-cuda-runtime-cu11==11.7.99\n", + " Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m849.3/849.3 kB\u001b[0m \u001b[31m48.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (3.1.2)\n", + "Collecting nvidia-nccl-cu11==2.14.3\n", + " Downloading nvidia_nccl_cu11-2.14.3-py3-none-manylinux1_x86_64.whl (177.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m177.1/177.1 MB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (1.11.1)\n", + "Collecting nvidia-cusparse-cu11==11.7.4.91\n", + " Downloading nvidia_cusparse_cu11-11.7.4.91-py3-none-manylinux1_x86_64.whl (173.2 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m173.2/173.2 MB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cublas-cu11==11.10.3.66\n", + " Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl (317.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m 
\u001b[32m317.1/317.1 MB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-nvtx-cu11==11.7.91\n", + " Downloading nvidia_nvtx_cu11-11.7.91-py3-none-manylinux1_x86_64.whl (98 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.6/98.6 kB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (4.5.0)\n", + "Collecting nvidia-curand-cu11==10.2.10.91\n", + " Downloading nvidia_curand_cu11-10.2.10.91-py3-none-manylinux1_x86_64.whl (54.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.6/54.6 MB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cusolver-cu11==11.4.0.1\n", + " Downloading nvidia_cusolver_cu11-11.4.0.1-2-py3-none-manylinux1_x86_64.whl (102.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m102.6/102.6 MB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cuda-nvrtc-cu11==11.7.99\n", + " Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.0/21.0 MB\u001b[0m \u001b[31m63.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cuda-cupti-cu11==11.7.101\n", + " Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.8/11.8 MB\u001b[0m \u001b[31m75.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: wheel in /usr/local/lib/python3.10/dist-packages (from nvidia-cublas-cu11==11.10.3.66->torch>=1.13.0->peft==0.3.0.dev0) (0.40.0)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from nvidia-cublas-cu11==11.10.3.66->torch>=1.13.0->peft==0.3.0.dev0) (67.7.2)\n", + "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.13.0->peft==0.3.0.dev0) (3.25.2)\n", + "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.13.0->peft==0.3.0.dev0) (16.0.2)\n", + "Requirement already satisfied: huggingface-hub<1.0,>=0.11.0 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (0.14.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (2022.10.31)\n", + "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (0.13.3)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (4.65.0)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (2.27.1)\n", + "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers->peft==0.3.0.dev0) (2023.4.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.13.0->peft==0.3.0.dev0) (2.1.2)\n", 
+ "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.3.0.dev0) (2.0.12)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.3.0.dev0) (2022.12.7)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.3.0.dev0) (3.4)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.3.0.dev0) (1.26.15)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.13.0->peft==0.3.0.dev0) (1.3.0)\n", + "Building wheels for collected packages: peft\n", + " Building wheel for peft (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for peft: filename=peft-0.3.0.dev0-py3-none-any.whl size=55537 sha256=3cc2a65c09926ac217ac671b7d9c1640eac9857f0aca55b78a9fcda484263073\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-1rjlvx70/wheels/4c/16/67/1002a2d4daa822eff130e6d85b90051b75d2ce0d26b9448e4a\n", + "Successfully built peft\n", + "Installing collected packages: nvidia-nvtx-cu11, nvidia-nccl-cu11, nvidia-cusparse-cu11, nvidia-curand-cu11, nvidia-cufft-cu11, nvidia-cuda-runtime-cu11, nvidia-cuda-nvrtc-cu11, nvidia-cuda-cupti-cu11, nvidia-cublas-cu11, nvidia-cusolver-cu11, nvidia-cudnn-cu11, torch, peft\n", + " Attempting uninstall: torch\n", + " Found existing installation: torch 1.12.0\n", + " Uninstalling torch-1.12.0:\n", + " Successfully uninstalled torch-1.12.0\n", + " Attempting uninstall: peft\n", + " Found existing installation: peft 0.2.0\n", + " Uninstalling peft-0.2.0:\n", + " Successfully uninstalled peft-0.2.0\n", + "Successfully installed nvidia-cublas-cu11-11.10.3.66 nvidia-cuda-cupti-cu11-11.7.101 nvidia-cuda-nvrtc-cu11-11.7.99 nvidia-cuda-runtime-cu11-11.7.99 nvidia-cudnn-cu11-8.5.0.96 nvidia-cufft-cu11-10.9.0.58 nvidia-curand-cu11-10.2.10.91 nvidia-cusolver-cu11-11.4.0.1 nvidia-cusparse-cu11-11.7.4.91 nvidia-nccl-cu11-2.14.3 nvidia-nvtx-cu11-11.7.91 peft-0.3.0.dev0 torch-2.0.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (0.1.98)\n" + ] + } + ], + "source": [ + "!pip install torch==1.12.0\n", + "!pip install transformers\n", + "!pip install git+https://github.com/huggingface/peft\n", + "!pip install sentencepiece" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ygb1xFIMNQKw" + }, + "source": [ + "## 克隆目录和代码" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yCEJh7NJNXz9", + "outputId": "ec16f31b-7af7-4eb8-82ce-5f9317bad941" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Cloning into 'Chinese-LLaMA-Alpaca'...\n", + "remote: Enumerating objects: 761, done.\u001b[K\n", + "remote: Counting objects: 100% (202/202), done.\u001b[K\n", + "remote: Compressing objects: 100% (172/172), done.\u001b[K\n", + "remote: Total 761 (delta 54), reused 69 (delta 29), pack-reused 559\u001b[K\n", + "Receiving objects: 100% (761/761), 11.16 MiB | 22.49 MiB/s, done.\n", + "Resolving deltas: 100% (444/444), done.\n", + "Cloning into 'llama.cpp'...\n", + "remote: Enumerating objects: 2086, done.\u001b[K\n", + 
"remote: Counting objects: 100% (842/842), done.\u001b[K\n", + "remote: Compressing objects: 100% (99/99), done.\u001b[K\n", + "remote: Total 2086 (delta 778), reused 756 (delta 743), pack-reused 1244\u001b[K\n", + "Receiving objects: 100% (2086/2086), 2.12 MiB | 16.33 MiB/s, done.\n", + "Resolving deltas: 100% (1345/1345), done.\n" + ] + } + ], + "source": [ + "!git clone https://github.com/ymcui/Chinese-LLaMA-Alpaca\n", + "!git clone https://github.com/ggerganov/llama.cpp" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nIyxX0DSNsgQ" + }, + "source": [ + "## 合并模型(Alpaca-Plus-7B)\n", + "\n", + "**⚠️ 再次提醒:7B模型需要25G内存,13B模型需要35G+内存。**\n", + "\n", + "此处使用的是🤗模型库中提供的基模型(已是HF格式),而不是Facebook官方的LLaMA模型,因此略去将原版LLaMA转换为HF格式的步骤。\n", + "\n", + "**这里直接运行第二步:合并LoRA权重**,生成全量模型权重。可以直接指定🤗模型库的地址,也可以是本地存放地址。\n", + "- 基模型:`decapoda-research/llama-7b-hf` *(use at your own risk)*\n", + "- LoRA模型:先写`ziqingyang/chinese-llama-plus-lora-7b`然后再写`ziqingyang/chinese-alpaca-plus-lora-7b`\n", + "- 输出类型:因为后续要量化,这里将`output_type`设置为`pth`\n", + "\n", + "💡 转换13B模型提示:\n", + "- 请将参数`--base_model`和`--lora_model`中的的`7b`改为`13b`即可\n", + "- **免费用户必须增加一个参数`--offload_dir`以缓解内存压力**,例如`--offload_dir ./offload_temp`\n", + "\n", + "该过程比较耗时(下载+转换),需要几分钟到十几分钟不等,请耐心等待。\n", + "转换好的模型存放在`alpaca-combined`目录。\n", + "如果你不需要量化模型,那么到这一步就结束了。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5AV4EW5hNhVV", + "outputId": "91901b82-88c4-405d-cf86-32f1a3a60467" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2023-04-28 08:07:00.276520: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "Base model: decapoda-research/llama-7b-hf\n", + "LoRA model(s) ['ziqingyang/chinese-llama-plus-lora-7b', 'ziqingyang/chinese-alpaca-plus-lora-7b']:\n", + "Loading checkpoint shards: 100% 33/33 [01:18<00:00, 2.39s/it]\n", + "Peft version: 0.3.0.dev0\n", + "Loading LoRA for 7B model\n", + "Loading LoRA ziqingyang/chinese-llama-plus-lora-7b\n", + "Extended vocabulary size to 49953\n", + "Downloading (…)/adapter_config.json: 100% 420/420 [00:00<00:00, 1.61MB/s]\n", + "Downloading adapter_model.bin: 100% 858M/858M [00:04<00:00, 185MB/s]\n", + "Merging with merge_and_unload...\n", + "Loading LoRA ziqingyang/chinese-alpaca-plus-lora-7b\n", + "Downloading tokenizer.model: 100% 758k/758k [00:00<00:00, 13.4MB/s]\n", + "Downloading (…)cial_tokens_map.json: 100% 96.0/96.0 [00:00<00:00, 535kB/s]\n", + "Downloading (…)okenizer_config.json: 100% 166/166 [00:00<00:00, 854kB/s]\n", + "Extended vocabulary size to 49954\n", + "Downloading (…)/adapter_config.json: 100% 423/423 [00:00<00:00, 2.31MB/s]\n", + "Downloading adapter_model.bin: 100% 1.14G/1.14G [00:16<00:00, 70.6MB/s]\n", + "Merging with merge_and_unload...\n", + "Saving to pth format...\n", + "Saving shard 1 of 1 into alpaca-combined/consolidated.00.pth\n" + ] + } + ], + "source": [ + "!python ./Chinese-LLaMA-Alpaca/scripts/merge_llama_with_chinese_lora.py \\\n", + " --base_model decapoda-research/llama-7b-hf \\\n", + " --lora_model ziqingyang/chinese-llama-plus-lora-7b,ziqingyang/chinese-alpaca-plus-lora-7b \\\n", + " --output_type pth \\\n", + " --output_dir alpaca-combined" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ueexcKo-Q_EW" + }, + "source": [ + "## 量化模型\n", + "接下来我们使用[llama.cpp](https://github.com/ggerganov/llama.cpp)工具对上一步生成的全量版本权重进行转换,生成4-bit量化模型。\n", + "\n", + "### 编译工具\n", + "\n", + 
"首先对llama.cpp工具进行编译。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_GbjsT2wRRCR", + "outputId": "2b4f2a38-d22d-4764-9a81-bad8bd72b7fe" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "I llama.cpp build info: \n", + "I UNAME_S: Linux\n", + "I UNAME_P: x86_64\n", + "I UNAME_M: x86_64\n", + "I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -pthread -march=native -mtune=native\n", + "I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native\n", + "I LDFLAGS: \n", + "I CC: cc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", + "I CXX: g++ (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", + "\n", + "cc -I. -O3 -DNDEBUG -std=c11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -pthread -march=native -mtune=native -c ggml.c -o ggml.o\n", + "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native -c llama.cpp -o llama.o\n", + "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native -c examples/common.cpp -o common.o\n", + "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native examples/main/main.cpp ggml.o llama.o common.o -o main \n", + "\n", + "==== Run ./main -h for help. ====\n", + "\n", + "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native examples/quantize/quantize.cpp ggml.o llama.o -o quantize \n", + "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats \n", + "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity \n", + "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding \n", + "g++ -I. 
+ "g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -march=native -mtune=native pocs/vdot/vdot.cpp ggml.o -o vdot \n"
+ ]
+ }
+ ],
+ "source": [
+ "!cd llama.cpp && make"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "gw2xpYC0RcQC"
+ },
+ "source": [
+ "### Convert the model to ggml format (FP16)\n",
+ "\n",
+ "In this step, we convert the model to ggml format (FP16).\n",
+ "- Before doing so, relocate the `alpaca-combined` directory: put the model files under `llama.cpp/zh-models/7B`, and put `tokenizer.model` under `llama.cpp/zh-models`\n",
+ "- Where is the tokenizer?\n",
+ " - It is in the `alpaca-combined` directory\n",
+ " - Or download it from: https://huggingface.co/ziqingyang/chinese-alpaca-lora-7b/resolve/main/tokenizer.model (note: the Alpaca and LLaMA `tokenizer.model` files must not be mixed up!); see the optional download sketch below\n",
+ "\n",
+ "💡 Tips for converting the 13B model:\n",
+ "- The 7B tokenizer can be used directly; the 13B and 7B tokenizers are identical\n",
+ "- The Alpaca and LLaMA `tokenizer.model` files must not be mixed up!\n",
+ "- Every `7B` you see below is just a folder name and no longer affects the conversion; renaming it is optional"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "5KgnFVStRjio",
+ "outputId": "19293a4a-a400-4cd3-c98b-80022dcd1f35"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "7B tokenizer.model\n"
+ ]
+ }
+ ],
+ "source": [
+ "!cd llama.cpp && mkdir zh-models && mv ../alpaca-combined zh-models/7B\n",
+ "!mv llama.cpp/zh-models/7B/tokenizer.model llama.cpp/zh-models/\n",
+ "!ls llama.cpp/zh-models/"
+ ]
+ },
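+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "(Optional) If your `alpaca-combined` directory did not include `tokenizer.model`, a sketch for fetching the Alpaca tokenizer from the URL mentioned above; note that it overwrites any existing `llama.cpp/zh-models/tokenizer.model`:\n",
+ "\n",
+ "```bash\n",
+ "# Sketch only: download the 7B Alpaca tokenizer (also valid for 13B) into zh-models\n",
+ "!wget -O llama.cpp/zh-models/tokenizer.model https://huggingface.co/ziqingyang/chinese-alpaca-lora-7b/resolve/main/tokenizer.model\n",
+ "```"
+ ]
+ },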
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "NUHeoTMQS1AQ",
+ "outputId": "378b70db-d13b-4aa9-8bb0-a1fc1cd4b13f"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Loading model file zh-models/7B/consolidated.00.pth\n",
+ "Loading vocab file zh-models/tokenizer.model\n",
+ "Writing vocab...\n",
+ "[ 1/291] Writing tensor tok_embeddings.weight | size 49954 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 2/291] Writing tensor norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+ "[ 3/291] Writing tensor output.weight | size 49954 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 4/291] Writing tensor layers.0.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 5/291] Writing tensor layers.0.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 6/291] Writing tensor layers.0.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 7/291] Writing tensor layers.0.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 8/291] Writing tensor layers.0.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+ "[ 9/291] Writing tensor layers.0.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 10/291] Writing tensor layers.0.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n",
+ "[ 11/291] Writing tensor layers.0.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 12/291] Writing tensor layers.0.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+ "[ 13/291] Writing tensor layers.1.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 14/291] Writing tensor layers.1.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 15/291] Writing tensor layers.1.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 16/291] Writing tensor layers.1.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 17/291] Writing tensor layers.1.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+ "[ 18/291] Writing tensor layers.1.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 19/291] Writing tensor layers.1.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n",
+ "[ 20/291] Writing tensor layers.1.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 21/291] Writing tensor layers.1.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+ "[ 22/291] Writing tensor layers.2.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 23/291] Writing tensor layers.2.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 24/291] Writing tensor layers.2.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 25/291] Writing tensor layers.2.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 26/291] Writing tensor layers.2.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+ "[ 27/291] Writing tensor layers.2.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 28/291] Writing tensor layers.2.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n",
+ "[ 29/291] Writing tensor layers.2.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 30/291] Writing tensor layers.2.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+ "[ 31/291] Writing tensor layers.3.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 32/291] Writing tensor layers.3.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 33/291] Writing tensor layers.3.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 34/291] Writing tensor layers.3.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 35/291] Writing tensor layers.3.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+ "[ 36/291] Writing tensor layers.3.feed_forward.w1.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 37/291] Writing tensor layers.3.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n",
+ "[ 38/291] Writing tensor layers.3.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 39/291] Writing tensor layers.3.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+ "[ 40/291] Writing tensor layers.4.attention.wq.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 41/291] Writing tensor layers.4.attention.wk.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 42/291] Writing tensor layers.4.attention.wv.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 43/291] Writing tensor layers.4.attention.wo.weight | size 4096 x 4096 | type UnquantizedDataType(name='F16')\n",
+ "[ 44/291] Writing tensor layers.4.attention_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+ "[ 45/291] Writing tensor layers.4.feed_forward.w1.weight | size 11008 x 4096 | 
type UnquantizedDataType(name='F16')\n",
+      "[ 46/291] Writing tensor layers.4.feed_forward.w2.weight | size 4096 x 11008 | type UnquantizedDataType(name='F16')\n",
+      "[ 47/291] Writing tensor layers.4.feed_forward.w3.weight | size 11008 x 4096 | type UnquantizedDataType(name='F16')\n",
+      "[ 48/291] Writing tensor layers.4.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+      "[... entries 49-290 omitted: layers.5 through layers.31 repeat the same pattern (wq/wk/wv/wo 4096 x 4096 F16; w1/w3 11008 x 4096 F16; w2 4096 x 11008 F16; attention_norm/ffn_norm 4096 F32) ...]\n",
+      "[291/291] Writing tensor layers.31.ffn_norm.weight | size 4096 | type UnquantizedDataType(name='F32')\n",
+      "Wrote zh-models/7B/ggml-model-f16.bin\n"
+     ]
+    }
+   ],
+   "source": [
+    "!cd llama.cpp && python convert.py zh-models/7B/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "hEZEJAVYCHkc"
+   },
+   "source": [
+    "### Quantize the FP16 model to 4-bit\n",
+    "\n",
+    "We further convert the FP16 model into a 4-bit quantized model (q4_0, as the output filename in the log shows). A sketch of the corresponding quantize invocation is given below."
+   ]
+  },
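The quantize cell below only shows its log output. For readers following along outside the notebook, here is a minimal sketch of the invocation that produces it, assuming llama.cpp has been cloned and built so that the `quantize` binary sits next to `convert.py`; the trailing `2` is the ftype code that selected q4_0 in llama.cpp builds of this vintage (newer builds also accept the literal name `q4_0`):

```bash
# Minimal sketch: quantize the FP16 GGML model to 4-bit (q4_0).
# Assumes llama.cpp is built and zh-models/7B/ggml-model-f16.bin exists;
# paths match the log below, and ftype 2 == q4_0 in this era of llama.cpp.
cd llama.cpp
./quantize ./zh-models/7B/ggml-model-f16.bin ./zh-models/7B/ggml-model-q4_0.bin 2
```

q4_0 stores weights block-wise at 4 bits with a per-block scale, which is why each f16 tensor shrinks in the log (e.g. 32.00 MB -> 18.00 MB) while the small f32 norm vectors are copied through unquantized; the sixteen `hist` columns report the observed frequency of each possible 4-bit value.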
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "2xyais7OUVDI",
+    "outputId": "b7fe3c62-489a-42e5-927a-8ab6088a3ecc"
+   },
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "llama.cpp: loading model from ./zh-models/7B/ggml-model-f16.bin\n",
+      "llama.cpp: saving model to ./zh-models/7B/ggml-model-q4_0.bin\n",
+      "[ 1/ 291] tok_embeddings.weight - 4096 x 49954, type = f16, quantizing .. size = 390.27 MB -> 219.52 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n",
+      "[ 2/ 291] norm.weight - 4096, type = f32, size = 0.016 MB\n",
+      "[ 3/ 291] output.weight - 4096 x 49954, type = f16, quantizing .. size = 390.27 MB -> 219.52 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n",
+      "[ 4/ 291] layers.0.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.026 0.018 0.028 0.044 0.064 0.088 0.111 0.245 0.111 0.087 0.064 0.044 0.028 0.018 0.026 \n",
+      "[ 5/ 291] layers.0.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.026 0.017 0.028 0.043 0.063 0.087 0.111 0.250 0.112 0.087 0.063 0.043 0.028 0.017 0.026 \n",
+      "[ 6/ 291] layers.0.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.019 0.031 0.046 0.065 0.087 0.107 0.237 0.107 0.087 0.065 0.046 0.030 0.019 0.027 \n",
+      "[ 7/ 291] layers.0.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.026 0.017 0.027 0.042 0.062 0.087 0.113 0.253 0.113 0.087 0.062 0.042 0.027 0.017 0.026 \n",
+      "[ 8/ 291] layers.0.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n",
+      "[ 9/ 291] layers.0.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n",
+      "[ 10/ 291] layers.0.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.047 0.031 0.020 0.027 \n",
+      "[ 11/ 291] layers.0.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.047 0.067 0.088 0.106 0.227 0.106 0.088 0.067 0.047 0.032 0.020 0.027 \n",
+      "[ 12/ 291] layers.0.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n",
+      "[... entries 13-225 omitted: layers.1 through layers.23 and the start of layers.24 quantize with near-identical sizes and histograms ...]\n",
+      "[ 226/ 291] layers.24.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. 
size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 227/ 291] layers.24.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 228/ 291] layers.24.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 229/ 291] layers.25.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 230/ 291] layers.25.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 231/ 291] layers.25.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 232/ 291] layers.25.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 233/ 291] layers.25.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 234/ 291] layers.25.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.028 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.105 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 235/ 291] layers.25.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 236/ 291] layers.25.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 237/ 291] layers.25.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 238/ 291] layers.26.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 239/ 291] layers.26.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 240/ 291] layers.26.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.031 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 241/ 291] layers.26.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 242/ 291] layers.26.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 243/ 291] layers.26.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.068 0.088 0.106 0.224 0.105 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 244/ 291] layers.26.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. 
size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 245/ 291] layers.26.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 246/ 291] layers.26.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 247/ 291] layers.27.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 248/ 291] layers.27.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 249/ 291] layers.27.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 250/ 291] layers.27.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 251/ 291] layers.27.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 252/ 291] layers.27.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.028 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 253/ 291] layers.27.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 254/ 291] layers.27.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 255/ 291] layers.27.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 256/ 291] layers.28.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 257/ 291] layers.28.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 258/ 291] layers.28.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 259/ 291] layers.28.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 260/ 291] layers.28.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 261/ 291] layers.28.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.105 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 262/ 291] layers.28.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. 
size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.047 0.031 0.020 0.027 \n", + "[ 263/ 291] layers.28.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 264/ 291] layers.28.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 265/ 291] layers.29.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 266/ 291] layers.29.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 267/ 291] layers.29.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 268/ 291] layers.29.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 269/ 291] layers.29.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 270/ 291] layers.29.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.224 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 271/ 291] layers.29.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.227 0.107 0.088 0.067 0.047 0.031 0.020 0.027 \n", + "[ 272/ 291] layers.29.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 273/ 291] layers.29.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 274/ 291] layers.30.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.047 0.032 0.020 0.027 \n", + "[ 275/ 291] layers.30.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 276/ 291] layers.30.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.047 0.032 0.020 0.027 \n", + "[ 277/ 291] layers.30.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 278/ 291] layers.30.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 279/ 291] layers.30.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 280/ 291] layers.30.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. 
size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.019 0.030 0.046 0.066 0.088 0.108 0.232 0.108 0.088 0.066 0.046 0.031 0.019 0.027 \n", + "[ 281/ 291] layers.30.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 282/ 291] layers.30.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 283/ 291] layers.31.attention.wq.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.228 0.106 0.088 0.067 0.047 0.031 0.020 0.027 \n", + "[ 284/ 291] layers.31.attention.wk.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.047 0.031 0.020 0.027 \n", + "[ 285/ 291] layers.31.attention.wv.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.031 0.047 0.067 0.088 0.106 0.228 0.106 0.088 0.067 0.047 0.031 0.020 0.027 \n", + "[ 286/ 291] layers.31.attention.wo.weight - 4096 x 4096, type = f16, quantizing .. size = 32.00 MB -> 18.00 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 287/ 291] layers.31.attention_norm.weight - 4096, type = f32, size = 0.016 MB\n", + "[ 288/ 291] layers.31.feed_forward.w1.weight - 4096 x 11008, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.225 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n", + "[ 289/ 291] layers.31.feed_forward.w2.weight - 11008 x 4096, type = f16, quantizing .. size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.019 0.030 0.045 0.065 0.088 0.109 0.237 0.109 0.088 0.065 0.045 0.030 0.019 0.027 \n", + "[ 290/ 291] layers.31.feed_forward.w3.weight - 4096 x 11008, type = f16, quantizing .. 
size = 86.00 MB -> 48.38 MB | hist: 0.000 0.027 0.020 0.032 0.047 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n",
+ "[ 291/ 291] layers.31.ffn_norm.weight - 4096, type = f32, size = 0.016 MB\n",
+ "llama_model_quantize_internal: model size = 13133.55 MB\n",
+ "llama_model_quantize_internal: quant size = 7388.06 MB\n",
+ "llama_model_quantize_internal: hist: 0.000 0.027 0.020 0.032 0.048 0.067 0.088 0.106 0.226 0.106 0.088 0.067 0.048 0.032 0.020 0.027 \n",
+ "\n",
+ "main: quantize time = 146381.23 ms\n",
+ "main: total time = 146381.23 ms\n"
+ ]
+ }
+ ],
+ "source": [
+ "!cd llama.cpp && ./quantize ./zh-models/7B/ggml-model-f16.bin ./zh-models/7B/ggml-model-q8_0.bin 7"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!sha256sum ./llama.cpp/zh-models/7B/ggml-model-q8_0.bin"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "2PR5jo2P-hOw",
+ "outputId": "2d808543-557d-4d0a-becb-ab35c4ccb8ff"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "0eec8927427f159397c79961a28d62d78849514a4a19033b247edd6ac3fc2cfd  ./llama.cpp/zh-models/7B/ggml-model-q8_0.bin\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "DLkuRAo9Vkb1"
+ },
+ "source": [
+ "### (Optional) Test decoding with the quantized model\n",
+ "All conversion steps are now complete.\n",
+ "We run a single command to check that the model loads correctly and can hold a conversation.\n",
+ "\n",
+ "The FP16 and Q8 quantized files are stored under ./llama.cpp/zh-models/7B and can be downloaded as needed."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "tW-ep1BsVQtG",
+ "outputId": "b3b28e5e-c731-4bb5-d3ae-c09d4c7bfb81"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "main: seed = 1682671021\n",
+ "llama.cpp: loading model from ./zh-models/7B/ggml-model-q8_0.bin\n",
+ "llama_model_load_internal: format = ggjt v1 (latest)\n",
+ "llama_model_load_internal: n_vocab = 49954\n",
+ "llama_model_load_internal: n_ctx = 512\n",
+ "llama_model_load_internal: n_embd = 4096\n",
+ "llama_model_load_internal: n_mult = 256\n",
+ "llama_model_load_internal: n_head = 32\n",
+ "llama_model_load_internal: n_layer = 32\n",
+ "llama_model_load_internal: n_rot = 128\n",
+ "llama_model_load_internal: ftype = 7 (mostly Q8_0)\n",
+ "llama_model_load_internal: n_ff = 11008\n",
+ "llama_model_load_internal: n_parts = 1\n",
+ "llama_model_load_internal: model size = 7B\n",
+ "llama_model_load_internal: ggml ctx size = 59.11 KB\n",
+ "llama_model_load_internal: mem required = 9180.12 MB (+ 1026.00 MB per state)\n",
+ "llama_init_from_file: kv self size = 256.00 MB\n",
+ "\n",
+ "system_info: n_threads = 4 / 4 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | \n",
+ "sampling: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.100000\n",
+ "generate: n_ctx = 512, n_batch = 512, n_predict = 512, n_keep = 0\n",
+ "\n",
+ "\n",
+ "\u001b[33m 详细介绍一下北京的名胜古迹:\u001b[0m长城、故宫等。同时介绍一些小众景点,比如颐和园中的石舫、圆明园中的琉璃花门等等。 [end of text]\n",
+ "\n",
+ "llama_print_timings: load time = 19881.66 ms\n",
+ "llama_print_timings: sample time = 48.31 ms / 32 runs ( 1.51 ms per run)\n",
+ "llama_print_timings: prompt eval time = 11365.17 ms / 11 tokens ( 1033.20 ms per token)\n",
+ "llama_print_timings: eval time = 33910.03 ms / 31 runs ( 1093.87 ms per run)\n",
+ "llama_print_timings: total time = 53841.09 ms\n"
+ ]
+ }
+ ],
+ "source": [
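+ "# Flag reference for llama.cpp's ./main: -m = path to the model file, --color = colorized output,\n",
+ "# -f = read an initial prompt from a file (here the Alpaca instruction template), -p = prompt text,\n",
+ "# -n = maximum number of tokens to predict.\n",
+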
"!cd llama.cpp && ./main -m ./zh-models/7B/ggml-model-q8_0.bin --color -f ./prompts/alpaca.txt -p \"详细介绍一下北京的名胜古迹:\" -n 512" + ] + } + ], + "metadata": { + "accelerator": "TPU", + "colab": { + "machine_shape": "hm", + "provenance": [] + }, + "gpuClass": "premium", + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/notebooks/legacy/convert_and_quantize_chinese_llama.ipynb b/notebooks/legacy/convert_and_quantize_chinese_llama.ipynb new file mode 100644 index 0000000..ce077f3 --- /dev/null +++ b/notebooks/legacy/convert_and_quantize_chinese_llama.ipynb @@ -0,0 +1,1874 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "machine_shape": "hm" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "gpuClass": "standard", + "accelerator": "TPU" + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# 转换并量化中文LLaMA/Alpaca模型\n", + "\n", + "🎉🎉🎉 **新:现在免费用户也有机会能够转换7B和13B模型了!**\n", + "\n", + "💡 提示和小窍门:\n", + "- 免费用户默认的内存只有12G左右,**笔者用免费账号实测选择TPU的话有机会随机出35G内存**,建议多试几次。如果能随机出25G内存以上的机器就可以了转换7B模型了,35G内存以上机器就能转换13B模型了\n", + "- Pro(+)用户请选择 “代码执行程序” -> “更改运行时类型” -> “高RAM”\n", + "- 实测:转换7B级别模型,25G内存的机器就够了;转换13B级别模型需要30G以上的内存(程序莫名崩掉或断开连接就说明内存爆了)\n", + "- 如果选了“高RAM”之后内存还是不够大的话,选择以下操作,有的时候会分配出很高内存的机器,祝你好运😄!\n", + " - 可以把GPU或者TPU也选上(虽然不会用到)\n", + " - 选GPU时,Pro用户可选“高级”类型GPU\n", + "\n", + "以下信息配置信息供参考(Pro订阅下测试),运行时规格设置为“高RAM”时的设备配置如下(有随机性):\n", + "\n", + "| 硬件加速器 | RAM | 硬盘 |\n", + "| :-- | :--: | :--: |\n", + "| None | 25GB | 225GB |\n", + "| TPU | 35GB | 225GB |\n", + "| GPU(标准,T4)| 25GB | 166GB |\n", + "| GPU(高性能,V100)| 25GB | 166GB |\n", + "| GPU(高性能,A100)| **80GB** | 166GB |\n", + "\n", + "*温馨提示:用完之后注意断开运行时,选择满足要求的最低配置即可,避免不必要的计算单元消耗(Pro只给100个计算单元)。*" + ], + "metadata": { + "id": "B1c96_k3MahN" + } + }, + { + "cell_type": "markdown", + "source": [ + "## 安装相关依赖" + ], + "metadata": { + "id": "vScqHD_jMFOV" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "E5WKFJXIL6ZU", + "outputId": "7ce317e5-c105-49a8-d1af-70c29e6246e1" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting transformers\n", + " Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.0/7.0 MB\u001b[0m \u001b[31m54.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from transformers) (2.27.1)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.9/dist-packages (from transformers) (1.24.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.9/dist-packages (from transformers) (6.0)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.9/dist-packages (from transformers) (4.65.0)\n", + "Collecting huggingface-hub<1.0,>=0.11.0\n", + " Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m200.1/200.1 kB\u001b[0m \u001b[31m24.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + 
"\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.9/dist-packages (from transformers) (2022.10.31)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.9/dist-packages (from transformers) (3.11.0)\n", + "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1\n", + " Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m97.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.9/dist-packages (from transformers) (23.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.9/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (4.5.0)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (2.0.12)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (2022.12.7)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (1.26.15)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (3.4)\n", + "Installing collected packages: tokenizers, huggingface-hub, transformers\n", + "Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting peft\n", + " Downloading peft-0.2.0-py3-none-any.whl (40 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.3/40.3 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: psutil in /usr/local/lib/python3.9/dist-packages (from peft) (5.9.4)\n", + "Requirement already satisfied: transformers in /usr/local/lib/python3.9/dist-packages (from peft) (4.28.0)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.9/dist-packages (from peft) (6.0)\n", + "Requirement already satisfied: torch>=1.13.0 in /usr/local/lib/python3.9/dist-packages (from peft) (2.0.0+cu118)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.9/dist-packages (from peft) (23.0)\n", + "Collecting accelerate\n", + " Downloading accelerate-0.18.0-py3-none-any.whl (215 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m215.3/215.3 kB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.9/dist-packages (from peft) (1.24.2)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.9/dist-packages (from torch>=1.13.0->peft) (1.11.1)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.9/dist-packages (from torch>=1.13.0->peft) (3.1.2)\n", + "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.9/dist-packages (from torch>=1.13.0->peft) (2.0.0)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.9/dist-packages (from torch>=1.13.0->peft) (3.1)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.9/dist-packages (from torch>=1.13.0->peft) (3.11.0)\n", + "Requirement 
already satisfied: typing-extensions in /usr/local/lib/python3.9/dist-packages (from torch>=1.13.0->peft) (4.5.0)\n",
+ "Requirement already satisfied: lit in /usr/local/lib/python3.9/dist-packages (from triton==2.0.0->torch>=1.13.0->peft) (16.0.1)\n",
+ "Requirement already satisfied: cmake in /usr/local/lib/python3.9/dist-packages (from triton==2.0.0->torch>=1.13.0->peft) (3.25.2)\n",
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.11.0 in /usr/local/lib/python3.9/dist-packages (from transformers->peft) (0.13.4)\n",
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.9/dist-packages (from transformers->peft) (2022.10.31)\n",
+ "Requirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from transformers->peft) (2.27.1)\n",
+ "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.9/dist-packages (from transformers->peft) (0.13.3)\n",
+ "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.9/dist-packages (from transformers->peft) (4.65.0)\n",
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.9/dist-packages (from jinja2->torch>=1.13.0->peft) (2.1.2)\n",
+ "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.9/dist-packages (from requests->transformers->peft) (2.0.12)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests->transformers->peft) (3.4)\n",
+ "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests->transformers->peft) (1.26.15)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests->transformers->peft) (2022.12.7)\n",
+ "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.9/dist-packages (from sympy->torch>=1.13.0->peft) (1.3.0)\n",
+ "Installing collected packages: accelerate, peft\n",
+ "Successfully installed accelerate-0.18.0 peft-0.2.0\n",
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Collecting sentencepiece\n",
+ "  Downloading sentencepiece-0.1.98-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
+ "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m18.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hInstalling collected packages: sentencepiece\n",
+ "Successfully installed sentencepiece-0.1.98\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pip install transformers\n",
+ "!pip install peft\n",
+ "!pip install sentencepiece"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Clone the repositories"
+ ],
+ "metadata": {
+ "id": "ygb1xFIMNQKw"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!git clone https://github.com/ymcui/Chinese-LLaMA-Alpaca\n",
+ "!git clone https://github.com/ggerganov/llama.cpp"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "yCEJh7NJNXz9",
+ "outputId": "91a0e4ff-af63-4f8e-ab82-ee4ddf583033"
+ },
+ "execution_count": 2,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Cloning into 'Chinese-LLaMA-Alpaca'...\n",
+ "remote: Enumerating objects: 559, done.\u001b[K\n",
+ "remote: Counting objects: 100% (129/129), done.\u001b[K\n",
+ "remote: Compressing objects: 100% (115/115), done.\u001b[K\n",
+ "remote: Total 559 (delta 30), reused 22 (delta 14), pack-reused 430\u001b[K\n",
+ "Receiving objects: 100% (559/559), 10.71 MiB | 25.49 MiB/s, done.\n",
+ "Resolving deltas: 100% (333/333), done.\n",
+ "Cloning into 'llama.cpp'...\n",
+ "remote: Enumerating objects: 1701, done.\u001b[K\n",
+ "remote: Counting objects: 100% (1701/1701), done.\u001b[K\n",
+ "remote: Compressing objects: 100% (620/620), done.\u001b[K\n",
+ "remote: Total 1701 (delta 1084), reused 1623 (delta 1047), pack-reused 0\u001b[K\n",
+ "Receiving objects: 100% (1701/1701), 1.86 MiB | 14.74 MiB/s, done.\n",
+ "Resolving deltas: 100% (1084/1084), done.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Merge the models (Alpaca-7B as an example)\n",
+ "\n",
+ "**⚠️ One more reminder: the 7B model needs 25GB of RAM, and the 13B model needs 35GB+.**\n",
+ "\n",
+ "We use the base model provided on the 🤗 model hub (already in HF format) rather than Facebook's official LLaMA release, so the step of converting the original LLaMA weights to HF format is skipped.\n",
+ "\n",
+ "**Here we run step two directly: merging the LoRA weights** to produce the full model weights. The arguments can point either at a 🤗 model hub repo or at a local path.\n",
+ "- Base model: `decapoda-research/llama-7b-hf` *(use at your own risk)*\n",
+ "- LoRA model: `ziqingyang/chinese-alpaca-lora-7b`\n",
+ "\n",
+ "💡 Tips for converting the 13B model:\n",
+ "- Simply change `7b` to `13b` in the `--base_model` and `--lora_model` arguments\n",
+ "- **Free-tier users must add the `--offload_dir` argument to relieve memory pressure**, e.g. `--offload_dir ./offload_temp`\n",
+ "\n",
+ "This step is fairly time-consuming (download + conversion) and takes anywhere from a few minutes to over ten minutes; please be patient.\n",
+ "The converted model is stored in the `alpaca-combined` directory.\n",
+ "If you do not need a quantized model, you are done at this point."
+ ],
+ "metadata": {
+ "id": "nIyxX0DSNsgQ"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!python ./Chinese-LLaMA-Alpaca/scripts/merge_llama_with_chinese_lora.py \\\n",
+ "    --base_model 'decapoda-research/llama-7b-hf' \\\n",
+ "    --lora_model 'ziqingyang/chinese-alpaca-lora-7b' \\\n",
+ "    --output_dir alpaca-combined"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "5AV4EW5hNhVV",
+ "outputId": "e34419d4-b7c9-4e22-af37-abf80d4163ba"
+ },
+ "execution_count": 3,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "2023-04-14 10:13:45.382526: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
+ "Downloading tokenizer.model: 100% 758k/758k [00:00<00:00, 12.7MB/s]\n",
+ "Downloading (…)cial_tokens_map.json: 100% 96.0/96.0 [00:00<00:00, 15.3kB/s]\n",
+ "Downloading (…)okenizer_config.json: 100% 166/166 [00:00<00:00, 63.2kB/s]\n",
+ "Downloading (…)lve/main/config.json: 100% 427/427 [00:00<00:00, 63.4kB/s]\n",
+ "Downloading (…)model.bin.index.json: 100% 25.5k/25.5k [00:00<00:00, 9.41MB/s]\n",
+ "Downloading shards: 0% 0/33 [00:00 121.96 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n",
+ "[2/291] norm.weight - [4096], type = f32, size = 0.016 MB\n",
+ "[3/291] output.weight - [4096 x 49954], type = f16, quantizing .. size = 390.27 MB -> 121.96 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.126 0.134 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n",
+ "[4/291] layers.0.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.021 0.016 0.028 0.046 0.071 0.103 0.137 0.158 0.137 0.103 0.071 0.046 0.028 0.016 0.021 \n",
+ "[5/291] layers.0.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.021 0.016 0.027 0.045 0.071 0.104 0.138 0.158 0.139 0.104 0.071 0.045 0.027 0.016 0.021 \n",
+ "[6/291] layers.0.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.018 0.032 0.051 0.076 0.103 0.128 0.141 0.128 0.103 0.075 0.051 0.032 0.019 0.022 \n",
+ "[7/291] layers.0.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.021 0.016 0.028 0.046 0.072 0.105 0.136 0.151 0.136 0.105 0.072 0.046 0.028 0.016 0.021 \n", + "[8/291] layers.0.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[9/291] layers.0.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[10/291] layers.0.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.126 0.134 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[11/291] layers.0.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.126 0.134 0.125 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[12/291] layers.0.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[13/291] layers.1.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.032 0.051 0.077 0.104 0.127 0.137 0.127 0.104 0.077 0.051 0.032 0.019 0.022 \n", + "[14/291] layers.1.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.021 0.018 0.032 0.051 0.076 0.104 0.128 0.138 0.128 0.104 0.077 0.051 0.032 0.018 0.022 \n", + "[15/291] layers.1.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.018 0.031 0.051 0.076 0.104 0.129 0.139 0.129 0.104 0.076 0.051 0.031 0.018 0.021 \n", + "[16/291] layers.1.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.021 0.016 0.028 0.046 0.071 0.104 0.137 0.154 0.137 0.104 0.071 0.046 0.028 0.016 0.021 \n", + "[17/291] layers.1.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[18/291] layers.1.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[19/291] layers.1.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[20/291] layers.1.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[21/291] layers.1.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[22/291] layers.2.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.032 0.052 0.077 0.104 0.126 0.135 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[23/291] layers.2.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.032 0.051 0.076 0.104 0.127 0.137 0.127 0.104 0.077 0.051 0.032 0.019 0.022 \n", + "[24/291] layers.2.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.125 0.136 0.125 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[25/291] layers.2.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.032 0.052 0.077 0.104 0.126 0.135 0.126 0.104 0.077 0.052 0.032 0.019 0.022 \n", + "[26/291] layers.2.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[27/291] layers.2.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[28/291] layers.2.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[29/291] layers.2.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[30/291] layers.2.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[31/291] layers.3.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.126 0.135 0.126 0.104 0.077 0.052 0.032 0.019 0.022 \n", + "[32/291] layers.3.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.032 0.052 0.077 0.104 0.126 0.136 0.126 0.104 0.077 0.052 0.032 0.019 0.022 \n", + "[33/291] layers.3.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.125 0.135 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[34/291] layers.3.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[35/291] layers.3.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[36/291] layers.3.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[37/291] layers.3.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[38/291] layers.3.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[39/291] layers.3.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[40/291] layers.4.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.126 0.135 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[41/291] layers.4.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.032 0.052 0.077 0.104 0.126 0.135 0.126 0.104 0.077 0.052 0.032 0.019 0.022 \n", + "[42/291] layers.4.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.135 0.125 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[43/291] layers.4.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.078 0.104 0.125 0.134 0.125 0.104 0.077 0.053 0.033 0.019 0.022 \n", + "[44/291] layers.4.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[45/291] layers.4.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[46/291] layers.4.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[47/291] layers.4.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[48/291] layers.4.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[49/291] layers.5.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[50/291] layers.5.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.126 0.134 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[51/291] layers.5.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.125 0.135 0.125 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[52/291] layers.5.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[53/291] layers.5.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[54/291] layers.5.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[55/291] layers.5.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[56/291] layers.5.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[57/291] layers.5.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[58/291] layers.6.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[59/291] layers.6.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.125 0.134 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[60/291] layers.6.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.126 0.134 0.125 0.104 0.077 0.053 0.033 0.019 0.022 \n", + "[61/291] layers.6.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[62/291] layers.6.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[63/291] layers.6.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[64/291] layers.6.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[65/291] layers.6.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[66/291] layers.6.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[67/291] layers.7.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[68/291] layers.7.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.126 0.134 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[69/291] layers.7.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.125 0.135 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[70/291] layers.7.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[71/291] layers.7.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[72/291] layers.7.feed_forward.w1.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[73/291] layers.7.feed_forward.w2.weight - [11008 x 4096], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[74/291] layers.7.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[75/291] layers.7.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "[76/291] layers.8.attention.wq.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.133 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[77/291] layers.8.attention.wk.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.052 0.033 0.019 0.022 \n", + "[78/291] layers.8.attention.wv.weight - [4096 x 4096], type = f16, quantizing .. size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.052 0.077 0.104 0.125 0.134 0.126 0.104 0.077 0.052 0.033 0.019 0.022 \n", + "[79/291] layers.8.attention.wo.weight - [4096 x 4096], type = f16, quantizing .. 
size = 32.00 MB -> 10.00 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.132 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[80/291] layers.8.attention_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "(... entries [81/291] through [289/291] omitted: the same per-layer pattern repeats for layers 8 to 31, with each [4096 x 4096] attention weight quantized from 32.00 MB to 10.00 MB, each [4096 x 11008] feed-forward weight from 86.00 MB to 26.88 MB, and each [4096] norm weight kept in f32 at 0.016 MB, all with near-identical histograms ...)\n", + "[290/291] layers.31.feed_forward.w3.weight - [4096 x 11008], type = f16, quantizing .. 
size = 86.00 MB -> 26.88 MB | hist: 0.000 0.022 0.019 0.033 0.053 0.077 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "[291/291] layers.31.ffn_norm.weight - [4096], type = f32, size = 0.016 MB\n", + "llama_model_quantize_internal: model size = 13133.55 MB\n", + "llama_model_quantize_internal: quant size = 4104.93 MB\n", + "llama_model_quantize_internal: hist: 0.000 0.022 0.019 0.033 0.053 0.078 0.104 0.125 0.134 0.125 0.104 0.078 0.053 0.033 0.019 0.022 \n", + "\n", + "main: quantize time = 178732.41 ms\n", + "main: total time = 178732.41 ms\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### (Optional) Test decoding with the quantized model\n", + "All conversion steps are now complete.\n", + "We run a single command to test whether the model loads correctly and can hold a conversation.\n", + "\n", + "The FP16 and Q4 quantized files are stored under ./llama.cpp/zh-models/7B and can be downloaded and used as needed." + ], + "metadata": { + "id": "DLkuRAo9Vkb1" + } + }, + { + "cell_type": "code", + "source": [ + "!cd llama.cpp && ./main -m ./zh-models/7B/ggml-model-q4_0.bin --color -f ./prompts/alpaca.txt -p \"详细介绍一下北京的名胜古迹:\" -n 512" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "tW-ep1BsVQtG", + "outputId": "0706c974-127e-4f21-be6b-d71ea4fb989b" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "main: seed = 1681467955\n", + "llama.cpp: loading model from ./zh-models/7B/ggml-model-q4_0.bin\n", + "llama_model_load_internal: format = ggjt v1 (latest)\n", + "llama_model_load_internal: n_vocab = 49954\n", + "llama_model_load_internal: n_ctx = 512\n", + "llama_model_load_internal: n_embd = 4096\n", + "llama_model_load_internal: n_mult = 256\n", + "llama_model_load_internal: n_head = 32\n", + "llama_model_load_internal: n_layer = 32\n", + "llama_model_load_internal: n_rot = 128\n", + "llama_model_load_internal: ftype = 2 (mostly Q4_0)\n", + "llama_model_load_internal: n_ff = 11008\n", + "llama_model_load_internal: n_parts = 1\n", + "llama_model_load_internal: model size = 7B\n", + "llama_model_load_internal: ggml ctx size = 59.11 KB\n", + "llama_model_load_internal: mem required = 5896.99 MB (+ 1026.00 MB per state)\n", + "llama_init_from_file: kv self size = 256.00 MB\n", + "\n", + "system_info: n_threads = 40 / 40 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | \n", + "sampling: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.100000\n", + "generate: n_ctx = 512, n_batch = 8, n_predict = 512, n_keep = 0\n", + "\n", + "\n", + "\u001b[33m 详细介绍一下北京的名胜古迹:\u001b[0m\n", + " 故宫:明、清两代皇室,御花园及八达门大街。 宫殿内有大量文物珍品; [end of text]\n", + "\n", + "llama_print_timings: load time = 717.01 ms\n", + "llama_print_timings: sample time = 48.97 ms / 32 runs ( 1.53 ms per run)\n", + "llama_print_timings: prompt eval time = 680.93 ms / 11 tokens ( 61.90 ms per token)\n", + "llama_print_timings: eval time = 4490.00 ms / 31 runs ( 144.84 ms per run)\n", + "llama_print_timings: total time = 5461.05 ms\n" + ] + } + ] + } + ] +} \ No newline at end of file From a5ef187b9d3610245f8e2992dffa91bc6807775f Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Fri, 16 Jun 2023 00:06:10 +0800 Subject: [PATCH 4/9] add assertions --- .../merge_llama_with_chinese_lora_low_mem.py | 52 +++++++++++-------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/scripts/merge_llama_with_chinese_lora_low_mem.py b/scripts/merge_llama_with_chinese_lora_low_mem.py index a770a2c..2291f79 100644 --- a/scripts/merge_llama_with_chinese_lora_low_mem.py +++ 
b/scripts/merge_llama_with_chinese_lora_low_mem.py @@ -2,7 +2,7 @@ Usage: python merge_llama_with_chinese_lora_low_mem.py \ --base_model path/to/llama/model \ - --lora_model path/to/first/lora/model[,path/to/second/lora/model] \ + --lora_model path/to/first/lora[,path/to/second/lora] \ --output_type [pth|huggingface] \ --output_dir path/to/output/dir """ @@ -12,19 +12,20 @@ import gc import torch import peft -from transformers import LlamaConfig, LlamaTokenizer +from transformers import LlamaTokenizer from transformers.modeling_utils import dtype_byte_size from huggingface_hub import snapshot_download import re parser = argparse.ArgumentParser() parser.add_argument('--base_model', default=None, required=True, - type=str, help="Please specify a base_model") + type=str, help="Please specify a base model.") parser.add_argument('--lora_model', default=None, required=True, - type=str, help="Please specify LoRA models to be merged (ordered); use commas to separate multiple LoRA models.") + type=str, help="Please specify LoRA models to be merged (ordered); use commas to separate multiple LoRA models") parser.add_argument('--output_type', default='pth',choices=['pth','huggingface'], type=str, - help="save the merged model in pth or huggingface format.") -parser.add_argument('--output_dir', default='./', type=str) + help="Save the merged model in pth or huggingface format") +parser.add_argument('--output_dir', default='./merged_model', type=str) +parser.add_argument('--verbose', default=False, action='store_true', help="Show detailed messages") emb_to_model_size = { @@ -121,7 +122,7 @@ def unpermute(w): ) -def save_shards(model_sd, num_shards: int, prefix=""): +def save_shards(model_sd, num_shards: int, prefix="", verbose=False): # Add the no_grad context manager with torch.no_grad(): if num_shards == 1: @@ -144,11 +145,9 @@ def save_shards(model_sd, num_shards: int, prefix=""): new_k = translate_state_dict_key(k) if new_k is not None: if new_k=='tok_embeddings.weight': - print(f"Processing {new_k}") assert v.size(1)%num_shards==0 splits = v.split(v.size(1)//num_shards,dim=1) elif new_k=='output.weight': - print(f"Processing {new_k}") if v.size(0)%num_shards==0: splits = v.split(v.size(0)//num_shards,dim=0) else: @@ -156,42 +155,35 @@ def save_shards(model_sd, num_shards: int, prefix=""): size_list[-1] += v.size(0)%num_shards splits = v.split(size_list, dim=0) # 13B: size_list == [24976,24977] elif new_k=='norm.weight': - print(f"Processing {new_k}") splits = [v] * num_shards elif 'ffn_norm.weight' in new_k: - print(f"Processing {new_k}") splits = [v] * num_shards elif 'attention_norm.weight' in new_k: - print(f"Processing {new_k}") splits = [v] * num_shards elif 'w1.weight' in new_k: - print(f"Processing {new_k}") splits = v.split(v.size(0)//num_shards,dim=0) elif 'w2.weight' in new_k: - print(f"Processing {new_k}") splits = v.split(v.size(1)//num_shards,dim=1) elif 'w3.weight' in new_k: - print(f"Processing {new_k}") splits = v.split(v.size(0)//num_shards,dim=0) elif 'wo.weight' in new_k: - print(f"Processing {new_k}") splits = v.split(v.size(1)//num_shards,dim=1) elif 'wv.weight' in new_k: - print(f"Processing {new_k}") splits = v.split(v.size(0)//num_shards,dim=0) elif "wq.weight" in new_k or "wk.weight" in new_k: - print(f"Processing {new_k}") v = unpermute(v) splits = v.split(v.size(0)//num_shards,dim=0) else: print(f"Unexpected key {new_k}") raise ValueError + if verbose: + print(f"Processing {new_k}") for sd,split in zip(new_state_dicts,splits): sd[new_k] = split.clone() del split @@ -248,6 
+240,12 @@ def merge_shards(output_dir, num_shards: int): tokenizer = LlamaTokenizer.from_pretrained(lora_model_path) lora_config = peft.LoraConfig.from_pretrained(lora_model_path) lora_state_dict = torch.load(os.path.join(lora_model_path,'adapter_model.bin'),map_location='cpu') + if 'base_model.model.model.embed_tokens.weight' in lora_state_dict: + lora_vocab_size = lora_state_dict['base_model.model.model.embed_tokens.weight'].shape[0] + assert lora_vocab_size==len(tokenizer), \ + (f"The vocab size of the tokenizer {len(tokenizer)} does not match the vocab size of the LoRA weight {lora_vocab_size}.\n" + "Did you misuse the LLaMA tokenizer with the Alpaca-LoRA weight?\n" + "Make sure that you use LLaMA tokenizer with the LLaMA-LoRA weight and Alpaca tokenizer with the Alpaca-LoRA weight!") tokenizers_and_loras.append( { "tokenizer" :tokenizer, @@ -256,6 +254,13 @@ def merge_shards(output_dir, num_shards: int): "scaling": lora_config.lora_alpha / lora_config.r, "fan_in_fan_out" : lora_config.fan_in_fan_out, }) + if len(tokenizers_and_loras)==2: + t1_vocab_size = len(tokenizers_and_loras[0]["tokenizer"]) + t2_vocab_size = len(tokenizers_and_loras[1]["tokenizer"]) + assert t1_vocab_size<=t2_vocab_size, \ + (f"The vocab size of the first tokenizer is {t1_vocab_size}\n" + f"The vocab size of the second tokenizer is {t2_vocab_size}, which is smaller than {t1_vocab_size}\n" + "This is not the intended use. Please check your model and tokenizer.") if not os.path.exists(base_model_path): print("Cannot find lora model on the disk. Downloading lora model from hub...") @@ -282,16 +287,19 @@ def merge_shards(output_dir, num_shards: int): dims_per_head = dim // n_heads base = 10000.0 inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) + print("Merging...") for k in state_dict: for ti, tandl in enumerate(tokenizers_and_loras): saved_key = 'base_model.model.'+k lora_key_A = saved_key.replace('.weight','.lora_A.weight') if saved_key in tandl['state_dict']: - print(f"copying {saved_key} from {ti}-th LoRA weight to {k}") + if args.verbose: + print(f"copying {saved_key} from {ti}-th LoRA weight to {k}") state_dict[k] = tandl['state_dict'][saved_key].half().clone() # do we need half()? if lora_key_A in tandl['state_dict']: lora_key_B = lora_key_A.replace('lora_A.weight','lora_B.weight') - print(f"merging {lora_key_A} and lora_B.weight form {ti}-th LoRA weight to {k}") + if args.verbose: + print(f"merging {lora_key_A} and lora_B.weight from {ti}-th LoRA weight to {k}") state_dict[k] += ( transpose( tandl['state_dict'][lora_key_B].float() @ tandl['state_dict'][lora_key_A].float(), tandl['fan_in_fan_out']) * tandl['scaling'] ) @@ -308,8 +316,8 @@ def merge_shards(output_dir, num_shards: int): print(f"Saving ckpt {filename} to {output_dir} in HF format...") torch.save(state_dict,os.path.join(output_dir, filename)) elif output_type=='pth': - print(f"Saving ckpt {filename} to {output_dir} in pth format...") - save_shards(model_sd=state_dict, num_shards=num_shards,prefix=f"L{index+1}-") + print(f"Converting to pth format...") + save_shards(model_sd=state_dict, num_shards=num_shards,prefix=f"L{index+1}-", verbose=args.verbose) del state_dict gc.collect() # Effectively enforce garbage collection
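Note: the assertions added in PATCH 4/9 above guard against pairing a LoRA checkpoint with the wrong tokenizer before any weights are merged. A minimal standalone sketch of the same check, assuming a hypothetical adapter path (none of the names below are shipped by the repo):

    import torch
    from transformers import LlamaTokenizer

    lora_model_path = "path/to/lora"  # hypothetical placeholder path
    tokenizer = LlamaTokenizer.from_pretrained(lora_model_path)
    lora_sd = torch.load(f"{lora_model_path}/adapter_model.bin", map_location="cpu")
    emb_key = "base_model.model.model.embed_tokens.weight"
    if emb_key in lora_sd:
        # A LoRA that resizes the embedding must carry one row per tokenizer token;
        # otherwise the merged model and its tokenizer disagree on the vocab size.
        assert lora_sd[emb_key].shape[0] == len(tokenizer), (
            f"tokenizer has {len(tokenizer)} tokens, "
            f"LoRA embedding has {lora_sd[emb_key].shape[0]} rows"
        )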
From 8c7322c7779450e5eaab9afc2e183d9b91d06c5b Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Fri, 16 Jun 2023 00:11:39 +0800 Subject: [PATCH 5/9] remove comments --- scripts/merge_llama_with_chinese_lora_low_mem.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scripts/merge_llama_with_chinese_lora_low_mem.py b/scripts/merge_llama_with_chinese_lora_low_mem.py index 2291f79..be48a01 100644 --- a/scripts/merge_llama_with_chinese_lora_low_mem.py +++ b/scripts/merge_llama_with_chinese_lora_low_mem.py @@ -308,10 +308,6 @@ def merge_shards(output_dir, num_shards: int): weight_size = state_dict[k].numel() * dtype_byte_size(state_dict[k].dtype) total_size += weight_size - # did we do anything? - # assert not torch.allclose(first_weight_old, first_weight) - # first_weight = base_model.model.layers[0].self_attn.q_proj.weight - # first_weight_old = first_weight.clone() if output_type=='huggingface': print(f"Saving ckpt {filename} to {output_dir} in HF format...") torch.save(state_dict,os.path.join(output_dir, filename)) From 2ee6100a624eb647b3a7cbbea73de48d3b2257c3 Mon Sep 17 00:00:00 2001 From: yaoxin <35353688+iMountTai@users.noreply.github.com> Date: Fri, 16 Jun 2023 10:10:02 +0800 Subject: [PATCH 6/9] Update merge_llama_with_chinese_lora_low_mem.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The error message can easily cause confusion, so it is removed. --- scripts/merge_llama_with_chinese_lora_low_mem.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/merge_llama_with_chinese_lora_low_mem.py b/scripts/merge_llama_with_chinese_lora_low_mem.py index be48a01..e6ab3aa 100644 --- a/scripts/merge_llama_with_chinese_lora_low_mem.py +++ b/scripts/merge_llama_with_chinese_lora_low_mem.py @@ -244,7 +244,6 @@ def merge_shards(output_dir, num_shards: int): lora_vocab_size = lora_state_dict['base_model.model.model.embed_tokens.weight'].shape[0] assert lora_vocab_size==len(tokenizer), \ (f"The vocab size of the tokenizer {len(tokenizer)} does not match the vocab size of the LoRA weight {lora_vocab_size}.\n" - "Did you misuse the LLaMA tokenizer with the Alpaca-LoRA weight?\n" "Make sure that you use LLaMA tokenizer with the LLaMA-LoRA weight and Alpaca tokenizer with the Alpaca-LoRA weight!") tokenizers_and_loras.append( { @@ -335,4 +334,4 @@ def merge_shards(output_dir, num_shards: int): if config=='pytorch_model.bin.index.json': obj['metadata']['total_size'] = total_size json.dump(obj, open(os.path.join(output_dir, config),'w'), indent=2) - print("Done.") \ No newline at end of file + print("Done.")
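Note: the merge loop that PATCH 4/9 gated behind --verbose and that PATCH 7/9 below renames applies the standard LoRA update, W_merged = W + scaling * transpose(lora_B @ lora_A), with scaling = lora_alpha / r. A minimal sketch of that arithmetic on illustrative shapes (the helper name, shapes, and alpha value are assumptions for illustration, not the script's API):

    import torch

    def merge_lora(W, lora_A, lora_B, scaling, fan_in_fan_out=False):
        # delta_W = B @ A, transposed for fan_in_fan_out (Conv1D-style) layers
        delta = lora_B.float() @ lora_A.float()
        if fan_in_fan_out:
            delta = delta.T
        return (W.float() + scaling * delta).half()

    r, d = 8, 4096                  # illustrative: rank-8 LoRA on a 4096x4096 projection
    W = torch.zeros(d, d, dtype=torch.float16)
    A = torch.randn(r, d)           # lora_A.weight: (r, in_features)
    B = torch.randn(d, r)           # lora_B.weight: (out_features, r)
    merged = merge_lora(W, A, B, scaling=32 / r)  # scaling = lora_alpha / r; alpha=32 assumed
    assert merged.shape == W.shape and merged.dtype == torch.float16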
From d2086171d3823b53b8f0948ab7e256ddd9b180c9 Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Fri, 16 Jun 2023 10:58:08 +0800 Subject: [PATCH 7/9] improve naming --- scripts/merge_llama_with_chinese_lora_low_mem.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/scripts/merge_llama_with_chinese_lora_low_mem.py b/scripts/merge_llama_with_chinese_lora_low_mem.py index e6ab3aa..166968b 100644 --- a/scripts/merge_llama_with_chinese_lora_low_mem.py +++ b/scripts/merge_llama_with_chinese_lora_low_mem.py @@ -123,7 +123,9 @@ def unpermute(w): def save_shards(model_sd, num_shards: int, prefix="", verbose=False): - # Add the no_grad context manager + """ + Convert and save the HF format weights to PTH format weights + """ with torch.no_grad(): if num_shards == 1: new_state_dict = {} @@ -288,17 +290,17 @@ def merge_shards(output_dir, num_shards: int): inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) print("Merging...") for k in state_dict: - for ti, tandl in enumerate(tokenizers_and_loras): + for tl_idx, tandl in enumerate(tokenizers_and_loras): saved_key = 'base_model.model.'+k lora_key_A = saved_key.replace('.weight','.lora_A.weight') if saved_key in tandl['state_dict']: if args.verbose: - print(f"copying {saved_key} from {ti}-th LoRA weight to {k}") + print(f"copying {saved_key} from {tl_idx}-th LoRA weight to {k}") state_dict[k] = tandl['state_dict'][saved_key].half().clone() # do we need half()? if lora_key_A in tandl['state_dict']: lora_key_B = lora_key_A.replace('lora_A.weight','lora_B.weight') if args.verbose: - print(f"merging {lora_key_A} and lora_B.weight from {ti}-th LoRA weight to {k}") + print(f"merging {lora_key_A} and lora_B.weight from {tl_idx}-th LoRA weight to {k}") state_dict[k] += ( transpose( tandl['state_dict'][lora_key_B].float() @@ -330,8 +332,10 @@ def merge_shards(output_dir, num_shards: int): for config in configs: if os.path.exists(os.path.join(base_model_path, config)): print(f"Saving {config}") - obj = json.load(open(os.path.join(base_model_path, config))) + with open(os.path.join(base_model_path, config),'r') as f: + obj = json.load(f) if config=='pytorch_model.bin.index.json': obj['metadata']['total_size'] = total_size - json.dump(obj, open(os.path.join(output_dir, config),'w'), indent=2) + with open(os.path.join(output_dir, config), 'w') as f: + json.dump(obj, f, indent=2) print("Done.")
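Note: the config rewrite at the end of PATCH 7/9 above keeps pytorch_model.bin.index.json consistent with the merged weights: its metadata total_size must equal the byte count of all merged tensors, which the script accumulates with dtype_byte_size. A sketch of that accounting (shard_bytes is a hypothetical helper name):

    import torch
    from transformers.modeling_utils import dtype_byte_size

    def shard_bytes(state_dict):
        # numel * bytes-per-element, summed over the shard's tensors,
        # mirroring how the script accumulates total_size for the index file
        return sum(t.numel() * dtype_byte_size(t.dtype) for t in state_dict.values())

    demo = {"w": torch.zeros(2, 3, dtype=torch.float16)}
    assert shard_bytes(demo) == 12  # 6 elements x 2 bytes each for fp16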
From c43f34b769d43698efccd5e2b398c2353fc51b81 Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Fri, 16 Jun 2023 11:03:10 +0800 Subject: [PATCH 8/9] update help info --- .../merge_llama_with_chinese_lora_low_mem.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/scripts/merge_llama_with_chinese_lora_low_mem.py b/scripts/merge_llama_with_chinese_lora_low_mem.py index 166968b..87fbf71 100644 --- a/scripts/merge_llama_with_chinese_lora_low_mem.py +++ b/scripts/merge_llama_with_chinese_lora_low_mem.py @@ -19,13 +19,15 @@ parser = argparse.ArgumentParser() parser.add_argument('--base_model', default=None, required=True, - type=str, help="Please specify a base model.") + type=str, help="Please specify a base model") parser.add_argument('--lora_model', default=None, required=True, type=str, help="Please specify LoRA models to be merged (ordered); use commas to separate multiple LoRA models") -parser.add_argument('--output_type', default='pth',choices=['pth','huggingface'], type=str, - help="Save the merged model in pth or huggingface format") -parser.add_argument('--output_dir', default='./merged_model', type=str) -parser.add_argument('--verbose', default=False, action='store_true', help="Show detailed messages") +parser.add_argument('--output_type', default='pth',choices=['pth','huggingface'], + type=str, help="Save the merged model in pth or huggingface format") +parser.add_argument('--output_dir', default='./merged_model', + type=str, help="The output folder where we save the merged mdoel") +parser.add_argument('--verbose', default=False, action='store_true', + help="Show detailed messages") emb_to_model_size = { @@ -290,21 +292,21 @@ def merge_shards(output_dir, num_shards: int): inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) print("Merging...") for k in state_dict: - for tl_idx, tandl in enumerate(tokenizers_and_loras): + for tl_idx, t_and_l in enumerate(tokenizers_and_loras): saved_key = 'base_model.model.'+k lora_key_A = saved_key.replace('.weight','.lora_A.weight') - if saved_key in tandl['state_dict']: + if saved_key in t_and_l['state_dict']: if args.verbose: print(f"copying {saved_key} from {tl_idx}-th LoRA weight to {k}") - state_dict[k] = tandl['state_dict'][saved_key].half().clone() # do we need half()? - if lora_key_A in tandl['state_dict']: + state_dict[k] = t_and_l['state_dict'][saved_key].half().clone() # do we need half()? + if lora_key_A in t_and_l['state_dict']: lora_key_B = lora_key_A.replace('lora_A.weight','lora_B.weight') if args.verbose: print(f"merging {lora_key_A} and lora_B.weight from {tl_idx}-th LoRA weight to {k}") state_dict[k] += ( transpose( - tandl['state_dict'][lora_key_B].float() - @ tandl['state_dict'][lora_key_A].float(), tandl['fan_in_fan_out']) * tandl['scaling'] + t_and_l['state_dict'][lora_key_B].float() + @ t_and_l['state_dict'][lora_key_A].float(), t_and_l['fan_in_fan_out']) * t_and_l['scaling'] ) weight_size = state_dict[k].numel() * dtype_byte_size(state_dict[k].dtype) total_size += weight_size From 25dba18cae809ab4aec8cb483c9ec75820e4e96c Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Fri, 16 Jun 2023 11:04:20 +0800 Subject: [PATCH 9/9] Update merge_llama_with_chinese_lora_low_mem.py --- scripts/merge_llama_with_chinese_lora_low_mem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/merge_llama_with_chinese_lora_low_mem.py b/scripts/merge_llama_with_chinese_lora_low_mem.py index 87fbf71..4c6b76c 100644 --- a/scripts/merge_llama_with_chinese_lora_low_mem.py +++ b/scripts/merge_llama_with_chinese_lora_low_mem.py @@ -25,7 +25,7 @@ parser.add_argument('--output_type', default='pth',choices=['pth','huggingface'], type=str, help="Save the merged model in pth or huggingface format") parser.add_argument('--output_dir', default='./merged_model', - type=str, help="The output folder where we save the merged mdoel") + type=str, help="The output folder to save the merged model") parser.add_argument('--verbose', default=False, action='store_true', help="Show detailed messages")
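Note: with the full series applied, a typical invocation follows the script's own usage header, for example merging two LoRAs in order and writing pth shards (all paths are placeholders):

    python scripts/merge_llama_with_chinese_lora_low_mem.py \
        --base_model path/to/llama/model \
        --lora_model path/to/first/lora,path/to/second/lora \
        --output_type pth \
        --output_dir ./merged_model \
        --verbose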