Add save/load for pt2e example #1927

Merged: 18 commits, Jul 29, 2024
506 changes: 236 additions & 270 deletions examples/3.x_api/pytorch/cv/static_quant/main.py

Large diffs are not rendered by default.

103 changes: 103 additions & 0 deletions examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh
@@ -0,0 +1,103 @@
#!/bin/bash
set -x

function main {

    init_params "$@"
    run_benchmark

}

# init params
function init_params {
    iters=100
    batch_size=16
    tuned_checkpoint=saved_results
    for var in "$@"
    do
        case $var in
            --topology=*)
                topology=$(echo $var |cut -f2 -d=)
            ;;
            --dataset_location=*)
                dataset_location=$(echo $var |cut -f2 -d=)
            ;;
            --input_model=*)
                input_model=$(echo $var |cut -f2 -d=)
            ;;
            --mode=*)
                mode=$(echo $var |cut -f2 -d=)
            ;;
            --batch_size=*)
                batch_size=$(echo $var |cut -f2 -d=)
            ;;
            --iters=*)
                iters=$(echo ${var} |cut -f2 -d=)
            ;;
            --int8=*)
                int8=$(echo ${var} |cut -f2 -d=)
            ;;
            --config=*)
                tuned_checkpoint=$(echo $var |cut -f2 -d=)
            ;;
            *)
                echo "Error: No such parameter: ${var}"
                exit 1
            ;;
        esac
    done

}


# run_benchmark
function run_benchmark {
    extra_cmd=''

    if [[ ${mode} == "accuracy" ]]; then
        mode_cmd=" --accuracy "
    elif [[ ${mode} == "performance" ]]; then
        mode_cmd=" --performance --iters ${iters}"
    else
        echo "Error: No such mode: ${mode}"
        exit 1
    fi
    if [[ ${int8} == "true" ]]; then
        extra_cmd=$extra_cmd" --int8"
    fi
    echo $extra_cmd

    if [ "${topology}" = "resnet18_pt2e_static" ]; then
        model_name_or_path="resnet18"
    fi

    if [[ ${mode} == "accuracy" ]]; then
        python main.py \
            --pretrained \
            -a resnet18 \
            -b 30 \
            --tuned_checkpoint ${tuned_checkpoint} \
            ${dataset_location} \
            ${extra_cmd} \
            ${mode_cmd}
    elif [[ ${mode} == "performance" ]]; then
        incbench --num_cores_per_instance 4 \
            main.py \
            --pretrained \
            -a resnet18 \
            -b 30 \
            --tuned_checkpoint ${tuned_checkpoint} \
            ${dataset_location} \
            ${extra_cmd} \
            ${mode_cmd}
    else
        echo "Error: No such mode: ${mode}"
        exit 1
    fi
}

main "$@"
9 changes: 8 additions & 1 deletion examples/3.x_api/pytorch/cv/static_quant/run_quant.sh
@@ -10,6 +10,7 @@ function main {

 # init params
 function init_params {
+    tuned_checkpoint="saved_results"
     for var in "$@"
     do
         case $var in
@@ -39,7 +40,13 @@ function run_tuning {
     if [ "${topology}" = "resnet18_pt2e_static" ]; then
         model_name_or_path="resnet18"
     fi
-    python main.py -a ${model_name_or_path} ${dataset_location} -q -e
+    python main.py \
+        --pretrained \
+        -t \
+        -a resnet18 \
+        -b 30 \
+        --tuned_checkpoint ${tuned_checkpoint} \
+        ${dataset_location}
 }

 main "$@"
run_benchmark.sh for the OPT-125M pt2e example (new file; full path not rendered in this view)
@@ -0,0 +1,99 @@
#!/bin/bash
set -x

function main {

    init_params "$@"
    run_benchmark

}

# init params
function init_params {
    iters=100
    batch_size=16
    tuned_checkpoint=saved_results
    task=lambada_openai
    for var in "$@"
    do
        case $var in
            --topology=*)
                topology=$(echo $var |cut -f2 -d=)
            ;;
            --dataset_location=*)
                dataset_location=$(echo $var |cut -f2 -d=)
            ;;
            --input_model=*)
                input_model=$(echo $var |cut -f2 -d=)
            ;;
            --mode=*)
                mode=$(echo $var |cut -f2 -d=)
            ;;
            --batch_size=*)
                batch_size=$(echo $var |cut -f2 -d=)
            ;;
            --iters=*)
                iters=$(echo ${var} |cut -f2 -d=)
            ;;
            --int8=*)
                int8=$(echo ${var} |cut -f2 -d=)
            ;;
            --config=*)
                tuned_checkpoint=$(echo $var |cut -f2 -d=)
            ;;
            *)
                echo "Error: No such parameter: ${var}"
                exit 1
            ;;
        esac
    done

}


# run_benchmark
function run_benchmark {
    extra_cmd=''

    if [[ ${mode} == "accuracy" ]]; then
        mode_cmd=" --accuracy "
    elif [[ ${mode} == "performance" ]]; then
        mode_cmd=" --performance --iters ${iters}"
    else
        echo "Error: No such mode: ${mode}"
        exit 1
    fi

    if [[ ${int8} == "true" ]]; then
        extra_cmd=$extra_cmd" --int8"
    fi
    echo $extra_cmd

    if [ "${topology}" = "opt_125m_pt2e_static" ]; then
        model_name_or_path="facebook/opt-125m"
    fi
    if [[ ${mode} == "accuracy" ]]; then
        python -u run_clm_no_trainer.py \
            --model ${model_name_or_path} \
            --output_dir ${tuned_checkpoint} \
            --task ${task} \
            --batch_size ${batch_size} \
            ${extra_cmd} ${mode_cmd}
    elif [[ ${mode} == "performance" ]]; then
        incbench --num_cores_per_instance 4 run_clm_no_trainer.py \
            --model ${model_name_or_path} \
            --batch_size ${batch_size} \
            --output_dir ${tuned_checkpoint} \
            ${extra_cmd} ${mode_cmd}
    else
        echo "Error: No such mode: ${mode}"
        exit 1
    fi
}

main "$@"
run_clm_no_trainer.py (OPT-125M pt2e example; file header not rendered in this view)
@@ -14,7 +14,7 @@
     "--revision", default=None,
     help="Transformers parameter: set the model hub commit number")
 parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k")
-parser.add_argument("--output_dir", nargs="?", default="./saved_results")
+parser.add_argument("--output_dir", nargs="?", default="")
 parser.add_argument("--quantize", action="store_true")
 parser.add_argument("--approach", type=str, default='static',
     help="Select from ['dynamic', 'static', 'weight-only']")
@@ -80,7 +80,7 @@ def get_example_inputs(tokenizer):
     dynamic_shapes = {"input_ids": (batch, seq_len)}
     example_inputs = get_example_inputs(tokenizer)
     exported_model = export(user_model, example_inputs=example_inputs, dynamic_shapes=dynamic_shapes)

     quant_config = get_default_static_config()
     # prepare
     prepare_model = prepare(exported_model, quant_config)
@@ -90,17 +90,32 @@ def get_example_inputs(tokenizer):
     prepare_model(*example_inputs)
     # convert
     converted_model = convert(prepare_model)
-    # inference
-    from torch._inductor import config
-
-    config.freezing = True
-    opt_model = torch.compile(converted_model)
-
-    opt_model.config = user_model.config  # for lm eval
-    user_model = opt_model
+
+    # save
+    if args.output_dir:
+        converted_model.save(example_inputs=example_inputs, output_dir=args.output_dir)
+
+
+
+if args.int8:
+    if args.output_dir:
+        print("Load int8 model.")
+        from neural_compressor.torch.quantization import load
+        model = load(args.output_dir)
+
+    model.config = user_model.config  # for lm eval
+
+    # Compile the quantized model and replace the Q/DQ pattern with Q-operator
+    from torch._inductor import config
+
+    config.freezing = True
+    opt_model = torch.compile(model)
+
+    opt_model.config = user_model.config  # for lm eval
+    user_model = opt_model

 if args.accuracy:

     from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
     eval_args = LMEvalParser(
         model="hf",
@@ -120,29 +135,21 @@ def get_example_inputs(tokenizer):
     print('Batch size = %d' % args.batch_size)

 if args.performance:
-    # user_model.eval()
-    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+    batch_size, input_leng = args.batch_size, 512
+    example_inputs = torch.ones((batch_size, input_leng), dtype=torch.long)
+    print("Batch size = {:d}".format(batch_size))
+    print("The length of input tokens = {:d}".format(input_leng))
     import time

-    samples = args.iters * args.batch_size
-    eval_args = LMEvalParser(
-        model="hf",
-        user_model=user_model,
-        tokenizer=tokenizer,
-        batch_size=args.batch_size,
-        tasks=args.tasks,
-        limit=samples,
-        device="cpu",
-    )
-    start = time.time()
-    results = evaluate(eval_args)
-    end = time.time()
-    for task_name in args.tasks.split(","):
-        if task_name == "wikitext":
-            acc = results["results"][task_name]["word_perplexity,none"]
-        else:
-            acc = results["results"][task_name]["acc,none"]
-    print("Accuracy: %.5f" % acc)
-    print('Throughput: %.3f samples/sec' % (samples / (end - start)))
-    print('Latency: %.3f ms' % ((end - start) * 1000 / samples))
-    print('Batch size = %d' % args.batch_size)
+    total_iters = args.iters
+    warmup_iters = 5
+    with torch.no_grad():
+        for i in range(total_iters):
+            if i == warmup_iters:
+                start = time.time()
+            user_model(example_inputs)
+    end = time.time()
+    latency = (end - start) / ((total_iters - warmup_iters) * args.batch_size)
+    throughput = ((total_iters - warmup_iters) * args.batch_size) / (end - start)
+    print("Latency: {:.3f} ms".format(latency * 10**3))
+    print("Throughput: {:.3f} samples/sec".format(throughput))
run_quant.sh for the OPT-125M pt2e example (file header not rendered in this view)
@@ -39,8 +39,9 @@ function run_tuning {

     if [ "${topology}" = "opt_125m_pt2e_static" ]; then
         model_name_or_path="facebook/opt-125m"
+        output_dir="saved_results"
     fi
-    python run_clm_no_trainer.py --model ${model_name_or_path} --quantize --accuracy --tasks "lambada_openai"
+    python run_clm_no_trainer.py --model ${model_name_or_path} --quantize --output_dir ${output_dir} --tasks "lambada_openai"
 }

 main "$@"
2 changes: 2 additions & 0 deletions neural_compressor/common/__init__.py
@@ -15,6 +15,7 @@

 from neural_compressor.common.utils import (
     level,
+    level_name,
     logger,
     Logger,
     TuningLogger,
@@ -31,6 +32,7 @@
 __all__ = [
     "options",
     "level",
+    "level_name",
     "logger",
     "Logger",
     "TuningLogger",
2 changes: 2 additions & 0 deletions neural_compressor/common/utils/logger.py
@@ -24,6 +24,7 @@

 __all__ = [
     "level",
+    "level_name",
     "Logger",  # TODO: not expose it
     "logger",
     "TuningLogger",
@@ -138,6 +139,7 @@ def warning(msg, *args, **kwargs):


 level = Logger().get_logger().level
+level_name = logging.getLevelName(level)

 logger = Logger
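
The new level_name export lets callers gate expensive output on the configured log level by name instead of by numeric value; a minimal sketch:

from neural_compressor.common import level_name

if level_name == "DEBUG":
    # Only emit costly diagnostics when debug logging is enabled
    print("verbose graph dump goes here")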
Half-precision conversion pass (file header not rendered in this view)
@@ -149,7 +149,8 @@ def transformation(gm: torch.fx.GraphModule, node_candidate_list: List[str], target_dtype):
     for pattern_pair in HALF_PRECISION_PATTERN_REGISTRY[target_dtype].values():
         apply_single_pattern_pair(gm, pattern_pair, node_candidate_list)
     utils.logger.info("Half precision conversion is done:")
-    gm.print_readable(True)
+    if utils.level_name == "DEBUG":  # pragma: no cover
+        gm.print_readable(True)


 # =============================================================================
3 changes: 2 additions & 1 deletion neural_compressor/torch/algorithms/pt2e_quant/save_load.py
@@ -25,7 +25,8 @@ def save(model, example_inputs, output_dir="./saved_results"):
     os.makedirs(output_dir, exist_ok=True)
     qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
     qconfig_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), QCONFIG_NAME)
-    quantized_ep = torch.export.export(model, example_inputs)
+    dynamic_shapes = model.dynamic_shapes
+    quantized_ep = torch.export.export(model, example_inputs, dynamic_shapes=dynamic_shapes)
     torch.export.save(quantized_ep, qmodel_file_path)
     for key, op_config in model.qconfig.items():
         model.qconfig[key] = op_config.to_dict()
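
Passing dynamic_shapes through the re-export matters because torch.export otherwise specializes the program to the example inputs' exact shapes. A standalone sketch of the underlying torch.export API (the toy module and dimension name are illustrative):

import torch
from torch.export import Dim, export

class Toy(torch.nn.Module):
    def forward(self, x):
        return x * 2

# Mark dim 0 of argument "x" as dynamic so the saved program accepts any batch size
batch = Dim("batch", min=1, max=64)
ep = export(Toy(), (torch.randn(4, 8),), dynamic_shapes={"x": {0: batch}})

torch.export.save(ep, "toy.pt2")
reloaded = torch.export.load("toy.pt2")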