convert.py (forked from turboderp/exllamav2)

from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Tokenizer
import argparse, os, shutil
import sys
import json
from conversion.tokenize import tokenize
from conversion.quantize import embeddings, measure_quant, quant
from conversion.optimize import optimize
from conversion.compile import compile_model
from conversion.qparams import qparams_headoptions
# import tracemalloc
# tracemalloc.start()
parser = argparse.ArgumentParser(description = "Convert model to ExLlamaV2")
parser.add_argument("-i", "--in_dir", type = str, help = "Input directory", default = "")
parser.add_argument("-o", "--out_dir", type = str, help = "Output (working) directory")
parser.add_argument("-nr", "--no_resume", action = "store_true", help = "Do not resume an interrupted job (deletes all files in the output directory)")
parser.add_argument("-cf", "--compile_full", type = str, help = "Output folder for compiled model with all config/tokenizer files")
parser.add_argument("-om", "--output_measurement", type = str, help = "Only perform measurement pass, then save measurement to the specified file")
parser.add_argument("-c", "--cal_dataset", type = str, help = "Calibration dataset (.parquet file)", default = "")
parser.add_argument("-r", "--dataset_rows", type = int, default = 100, help = "Number of rows to apply from dataset")
parser.add_argument("-mr", "--measurement_rows", type = int, default = 16, help = "Number of rows to apply from dataset when measuring")
parser.add_argument("-gr", "--gpu_rows", type = int, default = 0, help = "Threshold for paging hidden state to CPU")
parser.add_argument("-l", "--length", type = int, default = 2048, help = "Max no. tokens per sample")
parser.add_argument("-ml", "--measurement_length", type = int, default = 2048, help = "Max no. tokens per sample when measuring")
parser.add_argument("-b", "--bits", type = float, default = 4.125, help = "Target bits per weight")
parser.add_argument("-hb", "--head_bits", type = int, default = 6, help = "Target bits per weight (head layer)")
parser.add_argument("-m", "--measurement", type = str, help = "Reuse previous measurement")
parser.add_argument("-ss", "--shard_size", type = float, help = "Max shard size in MB (default: 8192)", default = 8192)
args = parser.parse_args()
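
# Example invocation (the paths below are illustrative placeholders, not files
# shipped with this repo):
#
#   python convert.py -i ./my-model-fp16 -o ./work_dir \
#       -cf ./my-model-exl2 -c ./calibration.parquet -b 4.125 -hb 6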
# Check some args
if not args.in_dir:
    print(" ## Please specify input model directory (-i, --in_dir)")
    sys.exit()

if not args.out_dir:
    print(" ## Please specify output/working directory (-o, --out_dir)")
    sys.exit()

if not args.cal_dataset:
    print(" ## Please specify dataset Parquet file (-c, --cal_dataset)")
    sys.exit()

if args.length > 2048 or args.measurement_length > 2048:
    print(" !! Warning: calibration rows > 2048 tokens may result in excessive VRAM use")

if args.head_bits not in qparams_headoptions:
    print(f" ## Error: {args.head_bits} is not a supported option for head layer bitrate")
    sys.exit()

if args.bits < 2 or args.bits > 8:
    print(f" !! Warning: target bitrate {args.bits} will likely not be attainable")

if args.output_measurement is not None and args.compile_full is not None:
    print(" ## Conflicting options: --output_measurement and --compile_full")
    sys.exit()
# Arguments
in_dir = None if args.in_dir == "" else os.path.abspath(args.in_dir)
out_dir = os.path.abspath(args.out_dir)
cal_dataset = None if args.cal_dataset == "" else os.path.abspath(args.cal_dataset)
dataset_rows = args.dataset_rows
measurement_rows = args.measurement_rows
gpu_rows = args.gpu_rows
length = args.length
measurement_length = args.measurement_length
bits = args.bits
head_bits = args.head_bits
reuse_measurement = args.measurement
shard_size = args.shard_size if args.shard_size > 0 else 1024 ** 3 # 1 PB = unlimited
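# (shard_size is interpreted in MB below, so 1024 ** 3 MB is 2 ** 50 bytes,
# one pebibyte; no real shard ever reaches it.)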
no_resume = args.no_resume
output_measurement = args.output_measurement
if output_measurement is not None:
    if os.path.isdir(output_measurement):
        output_measurement = os.path.join(output_measurement, "measurement.json")
compile_full = args.compile_full
if not os.path.exists(out_dir):
    print(f" ## Error: Directory not found: {out_dir}")
    sys.exit()
# Create config
config = ExLlamaV2Config()
config.model_dir = in_dir
config.prepare()
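# prepare() populates the config from the model directory, reading config.json
# and indexing the weight files.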
# Tokenizer
tokenizer = ExLlamaV2Tokenizer(config)
# Job file
job_file = os.path.join(out_dir, "job.json")
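# job.json holds every setting plus a "progress" marker, so an interrupted
# conversion can resume from the last completed stage.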
# Create new job
def save_job():
    global job_file, job
    with open(job_file, "w") as f:
        f.write(json.dumps(job, indent = 4))
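# save_job is also handed to each conversion stage below as a checkpoint
# callback, letting long-running stages persist partial progress as they go.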

if no_resume or not os.path.exists(job_file):

    print(f" -- Beginning new job")
    if len(os.listdir(out_dir)) != 0:
        print(f" !! Warning: Output directory is not empty: {out_dir}")

        if no_resume:
            print(f" !! Cleaning output directory: {out_dir}")
            for filename in os.listdir(out_dir):
                file_path = os.path.join(out_dir, filename)
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)

    if in_dir is None:
        print(f" ## Error: No input directory specified")
        sys.exit()

    if cal_dataset is None:
        print(f" ## Error: No calibration dataset specified")
        sys.exit()

    job = { "in_dir": in_dir,
            "out_dir": out_dir,
            "cal_dataset": cal_dataset,
            "dataset_rows": dataset_rows,
            "measurement_rows": measurement_rows,
            "gpu_rows": gpu_rows,
            "length": length,
            "measurement_length": measurement_length,
            "bits": bits,
            "head_bits": head_bits,
            "progress": "begin",
            "shard_size": shard_size,
            "output_measurement": output_measurement,
            "compile_full": compile_full
            }

    if reuse_measurement is not None:
        with open(reuse_measurement, "r") as f:
            imp_measurement = json.load(f)
        job["measurement"] = imp_measurement["measurement"]
        job["last_module_idx"] = imp_measurement["last_module_idx"]
        job["base_perplexity"] = imp_measurement["base_perplexity"]
        job["reuse_measurement"] = reuse_measurement

    save_job()
# Resume existing job
else:

    print(f" -- Resuming job")
    print(f" !! Note: Overriding options with settings from existing job")

    with open(job_file, "r") as f:
        job = json.load(f)

    if "invalid" in job:
        print(" ** Error: Corrupted job")
        sys.exit()

    if "shard_size" not in job: job["shard_size"] = shard_size
    if "output_measurement" not in job: job["output_measurement"] = output_measurement
    if "compile_full" not in job: job["compile_full"] = compile_full

    job["out_dir"] = out_dir
# Feedback
print(f" -- Input: {job['in_dir']}")
print(f" -- Output: {out_dir}")
print(f" -- Calibration dataset: {job['cal_dataset']}, {job['dataset_rows']} / {job['measurement_rows']} ({job['gpu_rows']}) rows, {job['length']} tokens per sample")
if job["output_measurement"] is None:
print(f" -- Target bits per weight: {job['bits']} (decoder), {job['head_bits']} (head)")
print(f" -- Max shard size: {job['shard_size']} MB")
else:
print(f" -- Measurement will be saved to {job['output_measurement']}")
print(f" !! Conversion script will end after measurement pass")
# Make sure subfolders exist
if job["compile_full"] is not None:
print(f" -- Full model will be compiled to: {job['compile_full']}")
if os.path.exists(job["compile_full"]):
if not os.path.isdir(job["compile_full"]):
print(f" ## Error: Output path {job['compile_full']} exists but is not a directory")
sys.exit()
if len(os.listdir(job["compile_full"])) > 0:
print(f" !! Warning: Output path {job['compile_full']} exists but is not empty")
out_tensor_dir = os.path.join(job["out_dir"], "out_tensor")
if not os.path.exists(out_tensor_dir):
os.makedirs(out_tensor_dir)
# Allocate space for hidden state
max_l = max(job["measurement_length"], job["length"])
config.max_input_len = max_l
config.max_attention_size = max_l ** 2
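# Attention over one max_l-token sample needs a max_l x max_l score matrix,
# hence the quadratic budget.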
# Create model without loading weights
model = ExLlamaV2(config)
model.load(lazy = True)
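# With lazy = True the module graph is built but no weights are loaded yet;
# the stages below stream in only the tensors they need.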
# Do the things
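# The loop below is a simple state machine: each pass executes the stage named
# by job["progress"], advances the marker, and checkpoints via save_job(), so
# an interrupted run restarts at the stage it was in.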

while True:

    progress = job["progress"]

    if progress == "begin":

        if "reuse_measurement" in job:
            print(f" -- Reusing measurement: {job['reuse_measurement']}")
            job["progress"] = "optimize"
            save_job()
        else:
            print(f" -- Tokenizing samples (measurement)...")
            tokenize(job, save_job, tokenizer, measure = True)
            job["progress"] = "initial_embeddings"
            save_job()

    if progress == "initial_embeddings":
        print(f" -- Token embeddings (measurement)...")
        embeddings(job, save_job, model)
        job["progress"] = "measure_quant"
        save_job()

    if progress == "measure_quant":
        print(f" -- Measuring quantization impact...")
        measure_quant(job, save_job, model)
        if job["output_measurement"] is None:
            job["progress"] = "optimize"
        else:
            job["progress"] = "finished"
        save_job()

    if progress == "optimize":
        print(f" -- Optimizing...")
        optimize(job, save_job)
        job["progress"] = "tokens_cal"
        save_job()

    if progress == "tokens_cal":
        print(f" -- Tokenizing samples...")
        tokenize(job, save_job, tokenizer)
        job["progress"] = "embeddings"
        save_job()

    if progress == "embeddings":
        print(f" -- Token embeddings again...")
        embeddings(job, save_job, model)
        job["progress"] = "quant"
        save_job()

    if progress == "quant":
        print(f" -- Quantizing...")
        quant(job, save_job, model)
        job["progress"] = "compile"
        save_job()

    if progress == "compile":
        print(f" -- Compiling output file...")
        compile_model(job, save_job, model)
        job["progress"] = "finished"
        save_job()

    if progress == "finished": break
print(f" -- Finished")