From 94c09e402adcdff60c235f603022fc74a6e123a5 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 14 Feb 2024 03:52:23 -0800 Subject: [PATCH 01/20] turbine tank --- models/requirements.txt | 2 + .../custom_models/sd_inference/clip.py | 32 ++ .../custom_models/sd_inference/unet.py | 31 ++ .../custom_models/sd_inference/utils.py | 2 +- .../custom_models/sd_inference/vae.py | 31 ++ .../custom_models/stateless_llama.py | 42 +- .../turbine_models/turbine_tank/run_models.py | 404 ++++++++++++++++++ .../turbine_tank/turbine_tank.py | 143 +++++++ 8 files changed, 681 insertions(+), 6 deletions(-) create mode 100644 models/turbine_models/turbine_tank/run_models.py create mode 100644 models/turbine_models/turbine_tank/turbine_tank.py diff --git a/models/requirements.txt b/models/requirements.txt index 4d2d16a56..99678eb68 100644 --- a/models/requirements.txt +++ b/models/requirements.txt @@ -5,3 +5,5 @@ transformers accelerate diffusers==0.24.0 brevitas @ git+https://github.com/Xilinx/brevitas.git@6695e8df7f6a2c7715b9ed69c4b78157376bb60b +# turbine tank downloading/uploading +azure-storage-blob diff --git a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index 996d5fb83..a2ab030ef 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -16,6 +16,7 @@ import torch import torch._dynamo as dynamo from transformers import CLIPTextModel, CLIPTokenizer +from turbine_models.turbine_tank import turbine_tank import argparse @@ -46,6 +47,18 @@ help="Specify vulkan target triple or rocm/cuda target device.", ) parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") +parser.add_argument( + "--download_ir", + action=argparse.BooleanOptionalAction, + default=True, + help="download IR from turbine tank", +) +parser.add_argument( + "--upload_ir", + action=argparse.BooleanOptionalAction, + default=False, + help="upload IR to turbine tank", +) def export_clip_model( @@ -57,6 +70,8 @@ def export_clip_model( device=None, target_triple=None, max_alloc=None, + download_ir=False, + upload_ir=False, ): # Load the tokenizer and text encoder to tokenize and encode the text. 
tokenizer = CLIPTokenizer.from_pretrained( @@ -64,6 +79,10 @@ def export_clip_model( subfolder="tokenizer", token=hf_auth_token, ) + + if download_ir: + return turbine_tank.downloadModelArtifacts(hf_model_name + "-clip"), tokenizer + text_encoder_model = CLIPTextModel.from_pretrained( hf_model_name, subfolder="text_encoder", @@ -94,6 +113,15 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)): module_str = str(CompiledModule.get_mlir_module(inst)) safe_name = utils.create_safe_name(hf_model_name, "-clip") + if upload_ir: + with open(f"{safe_name}.mlir", "w+") as f: + f.write(module_str) + model_name_upload = hf_model_name.replace("/", "_") + model_name_upload += "-clip" + turbine_tank.uploadToBlobStorage( + str(os.path.abspath(f"{safe_name}.mlir")), + f"{model_name_upload}/{model_name_upload}.mlir", + ) if compile_to != "vmfb": return module_str, tokenizer else: @@ -102,6 +130,8 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)): if __name__ == "__main__": args = parser.parse_args() + if args.upload_ir and args.download_ir: + raise ValueError("upload_ir and download_ir can't both be true") mod_str, _ = export_clip_model( args.hf_model_name, args.hf_auth_token, @@ -111,6 +141,8 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)): args.device, args.iree_target_triple, args.vulkan_max_allocation, + args.download_ir, + args.upload_ir, ) safe_name = args.hf_model_name.split("/")[-1].strip() safe_name = re.sub("-", "_", safe_name) diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index 272c7af7f..d193ded78 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -18,6 +18,7 @@ import safetensors import argparse +from turbine_models.turbine_tank import turbine_tank parser = argparse.ArgumentParser() parser.add_argument( @@ -53,6 +54,18 @@ help="Specify vulkan target triple or rocm/cuda target device.", ) parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") +parser.add_argument( + "--download_ir", + action=argparse.BooleanOptionalAction, + default=True, + help="download IR from turbine tank", +) +parser.add_argument( + "--upload_ir", + action=argparse.BooleanOptionalAction, + default=False, + help="upload IR to turbine tank", +) class UnetModel(torch.nn.Module): @@ -90,7 +103,12 @@ def export_unet_model( device=None, target_triple=None, max_alloc=None, + download_ir=False, + upload_ir=False, ): + if download_ir: + return turbine_tank.downloadModelArtifacts(hf_model_name + "-unet") + mapper = {} utils.save_external_weights( mapper, unet_model, external_weights, external_weight_path @@ -125,6 +143,15 @@ def main( module_str = str(CompiledModule.get_mlir_module(inst)) safe_name = utils.create_safe_name(hf_model_name, "-unet") + if upload_ir: + with open(f"{safe_name}.mlir", "w+") as f: + f.write(module_str) + model_name_upload = hf_model_name.replace("/", "_") + model_name_upload += "-unet" + turbine_tank.uploadToBlobStorage( + str(os.path.abspath(f"{safe_name}.mlir")), + f"{model_name_upload}/{model_name_upload}.mlir", + ) if compile_to != "vmfb": return module_str else: @@ -133,6 +160,8 @@ def main( if __name__ == "__main__": args = parser.parse_args() + if args.upload_ir and args.download_ir: + raise ValueError("upload_ir and download_ir can't both be true") unet_model = UnetModel( args.hf_model_name, args.hf_auth_token, @@ -150,6 +179,8 @@ def main( args.device, 
args.iree_target_triple, args.vulkan_max_allocation, + args.download_ir, + args.upload_ir, ) safe_name = utils.create_safe_name(args.hf_model_name, "-unet") with open(f"{safe_name}.mlir", "w+") as f: diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 37787fd3a..c4898dac7 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -79,7 +79,7 @@ def compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name): with open(f"{safe_name}.vmfb", "wb+") as f: f.write(flatbuffer_blob) print("Saved to", safe_name + ".vmfb") - exit() + return def create_safe_name(hf_model_name, model_name_str): diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index 03ef85556..2aef05bcf 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -18,6 +18,7 @@ import safetensors import argparse +from turbine_models.turbine_tank import turbine_tank parser = argparse.ArgumentParser() parser.add_argument( @@ -54,6 +55,18 @@ ) parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") parser.add_argument("--variant", type=str, default="decode") +parser.add_argument( + "--download_ir", + action=argparse.BooleanOptionalAction, + default=True, + help="download IR from turbine tank", +) +parser.add_argument( + "--upload_ir", + action=argparse.BooleanOptionalAction, + default=False, + help="upload IR to turbine tank", +) class VaeModel(torch.nn.Module): @@ -89,7 +102,12 @@ def export_vae_model( target_triple=None, max_alloc=None, variant="decode", + download_ir=False, + upload_ir=False, ): + if download_ir: + return turbine_tank.downloadModelArtifacts(hf_model_name + "-" + variant) + mapper = {} utils.save_external_weights( mapper, vae_model, external_weights, external_weight_path @@ -113,6 +131,15 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)): module_str = str(CompiledModule.get_mlir_module(inst)) safe_name = utils.create_safe_name(hf_model_name, "-vae") + if upload_ir: + with open(f"{safe_name}.mlir", "w+") as f: + f.write(module_str) + model_name_upload = hf_model_name.replace("/", "_") + model_name_upload = model_name_upload + "-" + variant + turbine_tank.uploadToBlobStorage( + str(os.path.abspath(f"{safe_name}.mlir")), + f"{model_name_upload}/{model_name_upload}.mlir", + ) if compile_to != "vmfb": return module_str else: @@ -121,6 +148,8 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)): if __name__ == "__main__": args = parser.parse_args() + if args.upload_ir and args.download_ir: + raise ValueError("upload_ir and download_ir can't both be true") vae_model = VaeModel( args.hf_model_name, args.hf_auth_token, @@ -139,6 +168,8 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)): args.iree_target_triple, args.vulkan_max_allocation, args.variant, + args.download_ir, + args.upload_ir, ) safe_name = utils.create_safe_name(args.hf_model_name, "-vae") with open(f"{safe_name}.mlir", "w+") as f: diff --git a/models/turbine_models/custom_models/stateless_llama.py b/models/turbine_models/custom_models/stateless_llama.py index 762690603..5e4c7ca1a 100644 --- a/models/turbine_models/custom_models/stateless_llama.py +++ b/models/turbine_models/custom_models/stateless_llama.py @@ -2,6 +2,7 @@ import sys import re import json +from 
turbine_models.turbine_tank import turbine_tank os.environ["TORCH_LOGS"] = "dynamic" from transformers import AutoTokenizer, AutoModelForCausalLM @@ -61,6 +62,18 @@ action="store_true", help="Compile LLM with StreamingLLM optimizations", ) +parser.add_argument( + "--download_ir", + action=argparse.BooleanOptionalAction, + default=True, + help="download IR from turbine tank", +) +parser.add_argument( + "--upload_ir", + action=argparse.BooleanOptionalAction, + default=False, + help="upload IR to turbine tank", +) def generate_schema(num_layers): @@ -107,7 +120,18 @@ def export_transformer_model( vulkan_max_allocation=None, streaming_llm=False, vmfb_path=None, + download_ir=False, + upload_ir=False, ): + tokenizer = AutoTokenizer.from_pretrained( + hf_model_name, + use_fast=False, + token=hf_auth_token, + ) + + if download_ir: + return turbine_tank.downloadModelArtifacts(hf_model_name), tokenizer + mod = AutoModelForCausalLM.from_pretrained( hf_model_name, torch_dtype=torch.float, @@ -121,11 +145,7 @@ def export_transformer_model( if precision == "f16": mod = mod.half() dtype = torch.float16 - tokenizer = AutoTokenizer.from_pretrained( - hf_model_name, - use_fast=False, - token=hf_auth_token, - ) + # TODO: generate these values instead of magic numbers NUM_LAYERS = mod.config.num_hidden_layers HEADS = getattr(mod.config, "num_key_value_heads", None) @@ -319,6 +339,14 @@ def evict_kvcache_space(self): module_str = str(CompiledModule.get_mlir_module(inst)) safe_name = hf_model_name.split("/")[-1].strip() safe_name = re.sub("-", "_", safe_name) + if upload_ir: + with open(f"{safe_name}.mlir", "w+") as f: + f.write(module_str) + model_name_upload = hf_model_name.replace("/", "_") + turbine_tank.uploadToBlobStorage( + str(os.path.abspath(f"{safe_name}.mlir")), + f"{model_name_upload}/{model_name_upload}.mlir", + ) if compile_to != "vmfb": return module_str, tokenizer else: @@ -382,6 +410,8 @@ def evict_kvcache_space(self): if __name__ == "__main__": args = parser.parse_args() + if args.upload_ir and args.download_ir: + raise ValueError("upload_ir and download_ir can't both be true") mod_str, _ = export_transformer_model( args.hf_model_name, args.hf_auth_token, @@ -395,6 +425,8 @@ def evict_kvcache_space(self): args.vulkan_max_allocation, args.streaming_llm, args.vmfb_path, + args.download_ir, + args.upload_ir, ) safe_name = args.hf_model_name.split("/")[-1].strip() safe_name = re.sub("-", "_", safe_name) diff --git a/models/turbine_models/turbine_tank/run_models.py b/models/turbine_models/turbine_tank/run_models.py new file mode 100644 index 000000000..5d612c4ee --- /dev/null +++ b/models/turbine_models/turbine_tank/run_models.py @@ -0,0 +1,404 @@ +# Copyright 2023 Nod Labs, Inc +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import argparse +from turbine_models.custom_models.sd_inference import ( + clip, + clip_runner, + unet, + unet_runner, + vae, + vae_runner, +) + +from turbine_models.custom_models.sd_inference import utils +import torch +import os +import turbine_models.custom_models.stateless_llama as llama +import difflib +from turbine_models.turbine_tank import turbine_tank + +parser = argparse.ArgumentParser() +parser.add_argument( + "--download_ir", + action=argparse.BooleanOptionalAction, + default=False, + help="download IR from turbine tank", +) +parser.add_argument( + "--upload_ir", + action=argparse.BooleanOptionalAction, + default=True, + help="upload IR to turbine tank", +) + +os.environ["TORCH_LOGS"] = "dynamic" +from shark_turbine.aot import * +from turbine_models.custom_models import llm_runner + +from turbine_models.gen_external_params.gen_external_params import ( + gen_external_params, +) + +DEFAULT_PROMPT = """[INST] <> +Be concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <> hi what are you? [/INST] +""" + + +def check_output_string(reference, output): + # Calculate and print diff + diff = difflib.unified_diff( + reference.splitlines(keepends=True), + output.splitlines(keepends=True), + fromfile="reference", + tofile="output", + lineterm="", + ) + return "".join(diff) + + +def run_llama_model(download_ir=False, upload_ir=True): + if not download_ir: + gen_external_params( + hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2", + hf_auth_token=None, + ) + llama.export_transformer_model( + hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2", + hf_auth_token=None, + compile_to="vmfb", + external_weights="safetensors", + # external_weight_file="Llama-2-7b-chat-hf-function-calling-v2_f16_int4.safetensors", Do not export weights because this doesn't get quantized + quantization="int4", + precision="f16", + device="llvm-cpu", + target_triple="host", + download_ir=download_ir, + upload_ir=upload_ir, + ) + + if download_ir: + return + + torch_str_cache_path = ( + f"models/turbine_models/tests/vmfb_comparison_cached_torch_output_f16_int4.txt" + ) + # if cached, just read + if os.path.exists(torch_str_cache_path): + with open(torch_str_cache_path, "r") as f: + torch_str = f.read() + else: + torch_str = llm_runner.run_torch_llm( + "Trelis/Llama-2-7b-chat-hf-function-calling-v2", None, DEFAULT_PROMPT + ) + + with open(torch_str_cache_path, "w") as f: + f.write(torch_str) + + turbine_str = llm_runner.run_llm( + "local-task", + DEFAULT_PROMPT, + "Llama_2_7b_chat_hf_function_calling_v2.vmfb", + "Trelis/Llama-2-7b-chat-hf-function-calling-v2", + None, + f"Llama_2_7b_chat_hf_function_calling_v2_f16_int4.safetensors", + ) + + result = check_output_string(torch_str, turbine_str) + + # clean up + os.remove("Llama_2_7b_chat_hf_function_calling_v2_f16_int4.safetensors") + os.remove("Llama_2_7b_chat_hf_function_calling_v2.vmfb") + os.remove("Llama_2_7b_chat_hf_function_calling_v2.mlir") + + return result + + +arguments = { + "hf_auth_token": None, + "hf_model_name": "CompVis/stable-diffusion-v1-4", + "batch_size": 1, + "height": 512, + "width": 512, + "run_vmfb": True, + "compile_to": None, + "external_weight_path": "", + "vmfb_path": "", + "external_weights": None, + "device": "local-task", + "iree_target_triple": 
"", + "vulkan_max_allocation": "4294967296", + "prompt": "a photograph of an astronaut riding a horse", + "in_channels": 4, +} + + +unet_model = unet.UnetModel( + # This is a public model, so no auth required + "CompVis/stable-diffusion-v1-4", + None, +) + +vae_model = vae.VaeModel( + # This is a public model, so no auth required + "CompVis/stable-diffusion-v1-4", + None, +) + + +def run_clip_model(download_ir=False, upload_ir=True): + clip.export_clip_model( + # This is a public model, so no auth required + "CompVis/stable-diffusion-v1-4", + None, + "vmfb", + "safetensors", + "stable_diffusion_v1_4_clip.safetensors", + "cpu", + download_ir=download_ir, + upload_ir=upload_ir, + ) + + if download_ir: + return + + arguments["external_weight_path"] = "stable_diffusion_v1_4_clip.safetensors" + arguments["vmfb_path"] = "stable_diffusion_v1_4_clip.vmfb" + turbine = clip_runner.run_clip( + arguments["device"], + arguments["prompt"], + arguments["vmfb_path"], + arguments["hf_model_name"], + arguments["hf_auth_token"], + arguments["external_weight_path"], + ) + torch_output = clip_runner.run_torch_clip( + arguments["hf_model_name"], arguments["hf_auth_token"], arguments["prompt"] + ) + err = utils.largest_error(torch_output, turbine[0]) + if err < 9e-5: + result = "CLIP SUCCESS: " + str(err) + else: + result = "CLIP FAILURE: " + str(err) + + # clean up + os.remove("stable_diffusion_v1_4_clip.safetensors") + os.remove("stable_diffusion_v1_4_clip.vmfb") + os.remove("stable_diffusion_v1_4_clip.mlir") + + return result + + +def run_unet_model(download_ir=False, upload_ir=True): + unet.export_unet_model( + unet_model, + # This is a public model, so no auth required + "CompVis/stable-diffusion-v1-4", + arguments["batch_size"], + arguments["height"], + arguments["width"], + None, + "vmfb", + "safetensors", + "stable_diffusion_v1_4_unet.safetensors", + "cpu", + download_ir=download_ir, + upload_ir=upload_ir, + ) + + if download_ir: + return + + arguments["external_weight_path"] = "stable_diffusion_v1_4_unet.safetensors" + arguments["vmfb_path"] = "stable_diffusion_v1_4_unet.vmfb" + sample = torch.rand( + arguments["batch_size"], + arguments["in_channels"], + arguments["height"] // 8, + arguments["width"] // 8, + dtype=torch.float32, + ) + timestep = torch.zeros(1, dtype=torch.float32) + encoder_hidden_states = torch.rand(2, 77, 768, dtype=torch.float32) + + turbine = unet_runner.run_unet( + arguments["device"], + sample, + timestep, + encoder_hidden_states, + arguments["vmfb_path"], + arguments["hf_model_name"], + arguments["hf_auth_token"], + arguments["external_weight_path"], + ) + torch_output = unet_runner.run_torch_unet( + arguments["hf_model_name"], + arguments["hf_auth_token"], + sample, + timestep, + encoder_hidden_states, + ) + err = utils.largest_error(torch_output, turbine) + if err < 9e-5: + result = "UNET SUCCESS: " + str(err) + else: + result = "UNET FAILURE: " + str(err) + + # clean up + os.remove("stable_diffusion_v1_4_unet.safetensors") + os.remove("stable_diffusion_v1_4_unet.vmfb") + os.remove("stable_diffusion_v1_4_unet.mlir") + + return result + + +def run_vae_decode(download_ir=False, upload_ir=True): + vae.export_vae_model( + vae_model, + # This is a public model, so no auth required + "CompVis/stable-diffusion-v1-4", + arguments["batch_size"], + arguments["height"], + arguments["width"], + None, + "vmfb", + "safetensors", + "stable_diffusion_v1_4_vae.safetensors", + "cpu", + variant="decode", + download_ir=download_ir, + upload_ir=upload_ir, + ) + + if download_ir: + return + + 
arguments["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors" + arguments["vmfb_path"] = "stable_diffusion_v1_4_vae.vmfb" + example_input = torch.rand( + arguments["batch_size"], + 4, + arguments["height"] // 8, + arguments["width"] // 8, + dtype=torch.float32, + ) + turbine = vae_runner.run_vae( + arguments["device"], + example_input, + arguments["vmfb_path"], + arguments["hf_model_name"], + arguments["hf_auth_token"], + arguments["external_weight_path"], + ) + torch_output = vae_runner.run_torch_vae( + arguments["hf_model_name"], + arguments["hf_auth_token"], + "decode", + example_input, + ) + err = utils.largest_error(torch_output, turbine) + if err < 9e-5: + result = "VAE DECODE SUCCESS: " + str(err) + else: + result = "VAE DECODE FAILURE: " + str(err) + + # clean up + os.remove("stable_diffusion_v1_4_vae.safetensors") + os.remove("stable_diffusion_v1_4_vae.vmfb") + os.remove("stable_diffusion_v1_4_vae.mlir") + + return result + + +def run_vae_encode(download_ir=False, upload_ir=True): + vae.export_vae_model( + vae_model, + # This is a public model, so no auth required + "CompVis/stable-diffusion-v1-4", + arguments["batch_size"], + arguments["height"], + arguments["width"], + None, + "vmfb", + "safetensors", + "stable_diffusion_v1_4_vae.safetensors", + "cpu", + variant="encode", + download_ir=download_ir, + upload_ir=upload_ir, + ) + + if download_ir: + return + + arguments["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors" + arguments["vmfb_path"] = "stable_diffusion_v1_4_vae.vmfb" + example_input = torch.rand( + arguments["batch_size"], + 3, + arguments["height"], + arguments["width"], + dtype=torch.float32, + ) + turbine = vae_runner.run_vae( + arguments["device"], + example_input, + arguments["vmfb_path"], + arguments["hf_model_name"], + arguments["hf_auth_token"], + arguments["external_weight_path"], + ) + torch_output = vae_runner.run_torch_vae( + arguments["hf_model_name"], + arguments["hf_auth_token"], + "encode", + example_input, + ) + err = utils.largest_error(torch_output, turbine) + if err < 2e-3: + result = "VAE ENCODE SUCCESS: " + str(err) + else: + result = "VAE ENCODE FAILURE: " + str(err) + + # clean up + os.remove("stable_diffusion_v1_4_vae.safetensors") + os.remove("stable_diffusion_v1_4_vae.vmfb") + os.remove("stable_diffusion_v1_4_vae.mlir") + + return result + + +if __name__ == "__main__": + args = parser.parse_args() + + if args.upload_ir and args.download_ir: + raise ValueError("upload_ir and download_ir can't both be true") + + if args.upload_ir: + result = "Turbine Tank Results\n" + llama_result = run_llama_model(args.download_ir, args.upload_ir) + result += llama_result + "\n" + clip_result = run_clip_model(args.download_ir, args.upload_ir) + result += clip_result + "\n" + unet_result = run_unet_model(args.download_ir, args.upload_ir) + result += unet_result + "\n" + vae_decode_result = run_vae_decode(args.download_ir, args.upload_ir) + result += vae_decode_result + "\n" + vae_encode_result = run_vae_encode(args.download_ir, args.upload_ir) + result += vae_encode_result + "\n" + f = open("daily_report.txt", "a") + f.write(result) + f.close() + turbine_tank.uploadToBlobStorage( + str(os.path.abspath("daily_report.txt")), "daily_report.txt" + ) + os.remove("daily_report.txt") + else: + run_llama_model(args.download_ir, args.upload_ir) + run_clip_model(args.download_ir, args.upload_ir) + run_unet_model(args.download_ir, args.upload_ir) + run_vae_decode(args.download_ir, args.upload_ir) + run_vae_encode(args.download_ir, 
args.upload_ir) diff --git a/models/turbine_models/turbine_tank/turbine_tank.py b/models/turbine_models/turbine_tank/turbine_tank.py new file mode 100644 index 000000000..92a294e3e --- /dev/null +++ b/models/turbine_models/turbine_tank/turbine_tank.py @@ -0,0 +1,143 @@ +from azure.storage.blob import BlobServiceClient + +import subprocess +import datetime +import os +from pathlib import Path + +custom_path = os.getenv("TURBINE_TANK_CACHE_DIR") +if custom_path is not None: + if not os.path.exists(custom_path): + os.mkdir(custom_path) + + WORKDIR = custom_path + + print(f"Using {WORKDIR} as local turbine_tank cache directory.") +else: + WORKDIR = os.path.join(str(Path.home()), ".local/turbine_tank/") + print( + f"turbine_tank local cache is located at {WORKDIR} . You may change this by assigning the TURBINE_TANK_CACHE_DIR environment variable." + ) +os.makedirs(WORKDIR, exist_ok=True) + +storage_account_key = "XSsr+KqxBLxXzRtFv3QbbdsAxdwDGe661Q1xY4ziMRtpCazN8W6HZePi6nwud5RNLC5Y7e410abg+AStyzmX1A==" +storage_account_name = "tankturbine" +connection_string = "DefaultEndpointsProtocol=https;AccountName=tankturbine;AccountKey=XSsr+KqxBLxXzRtFv3QbbdsAxdwDGe661Q1xY4ziMRtpCazN8W6HZePi6nwud5RNLC5Y7e410abg+AStyzmX1A==;EndpointSuffix=core.windows.net" +container_name = "tankturbine" + + +def get_short_git_sha() -> str: + try: + return ( + subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]) + .decode("utf-8") + .strip() + ) + except FileNotFoundError: + return None + + +def uploadToBlobStorage(file_path, file_name): + # create our prefix (we use this to keep track of when and what version of turbine is being used) + today = str(datetime.date.today()) + commit = get_short_git_sha() + prefix = today + "_" + commit + blob_service_client = BlobServiceClient.from_connection_string(connection_string) + blob_client = blob_service_client.get_blob_client( + container=container_name, blob=prefix + "/" + file_name + ) + blob = blob_client.from_connection_string( + conn_str=connection_string, + container_name=container_name, + blob_name=blob_client.blob_name, + ) + # we check to see if we already uploaded the blob (don't want to duplicate) + if blob.exists(): + print( + f"model artifacts have already been uploaded for {today} on the same github commit ({commit})" + ) + return + # upload to azure storage container tankturbine + with open(file_path, "rb") as data: + blob_client.upload_blob(data) + print(f"Uploaded {file_name}.") + + +def checkAndRemoveIfDownloadedOld(model_name: str, model_dir: str, prefix: str): + if os.path.isdir(model_dir) and len(os.listdir(model_dir)) == 1: + for item in os.listdir(model_dir): + item_path = os.path.join(model_dir, item) + # model artifacts already downloaded and up to date + # we check if model artifacts are behind using the prefix (day + git_sha) + if os.path.isdir(item_path) and item == prefix: + return True + # model artifacts are behind, so remove for new download + if os.path.isdir(item_path) and os.path.isfile( + os.path.join(item_path, model_name + ".mlir") + ): + os.remove(os.path.join(item_path, model_name + ".mlir")) + os.rmdir(item_path) + return False + # did not downloaded this model artifacts yet + return False + + +def download_public_folder(model_name: str, prefix: str, model_dir: str): + """Downloads a folder of blobs in azure container.""" + blob_service_client = BlobServiceClient.from_connection_string(connection_string) + container_client = blob_service_client.get_container_client( + container=container_name + ) + blob_list = 
container_client.list_blobs(name_starts_with=prefix) + + # go through the blobs with our target prefix + # example prefix: "2024-02-13_26d6428/CompVis_stable-diffusion-v1-4-clip" + for blob in blob_list: + blob_client = blob_service_client.get_blob_client( + container=container_name, blob=blob.name + ) + # create path if directory doesn't exist locally + dest_path = model_dir + if not os.path.isdir(dest_path): + os.makedirs(dest_path) + # download blob into local turbine tank cache + with open( + file=os.path.join(model_dir, model_name + ".mlir"), mode="wb" + ) as sample_blob: + download_stream = blob_client.download_blob() + sample_blob.write(download_stream.readall()) + + +def downloadModelArtifacts(model_name: str) -> str: + model_name = model_name.replace("/", "_") + container_client = BlobServiceClient.from_connection_string( + connection_string + ).get_container_client(container=container_name) + blob_list = container_client.list_blobs() + # get the latest blob uploaded to turbine tank (can't use [] notation for blob_list) + for blob in blob_list: + latest_blob = blob + # get the prefix for the latest blob (2024-02-13_26d6428) + download_latest_prefix = latest_blob.name.split("/")[0] + model_dir = os.path.join(WORKDIR, model_name) + # check if we already downloaded the model artifacts for this day + commit + exists = checkAndRemoveIfDownloadedOld( + model_name=model_name, model_dir=model_dir, prefix=download_latest_prefix + ) + if exists: + print("Already downloaded most recent version") + return "NA" + # download the model artifacts (passing in the model name, path in azure storage to model artifacts, local directory to store) + download_public_folder( + model_name, + download_latest_prefix + "/" + model_name, + os.path.join(model_dir, download_latest_prefix), + ) + model_dir = os.path.join(WORKDIR, model_name + "/" + download_latest_prefix) + mlir_filename = os.path.join(model_dir, model_name + ".mlir") + print( + f"Verifying that model artifacts were downloaded successfully to {mlir_filename}..." 
+ ) + assert os.path.exists(mlir_filename), f"MLIR not found at {mlir_filename}" + + return mlir_filename From 2139ab0633c8ea892e678da488ad1069c1a64744 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 14 Feb 2024 04:16:57 -0800 Subject: [PATCH 02/20] azure dep --- models/setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/models/setup.py b/models/setup.py index cf0ed2d6b..60ae10c4c 100644 --- a/models/setup.py +++ b/models/setup.py @@ -61,5 +61,6 @@ def load_version_info(): "transformers", "accelerate", "diffusers==0.24.0", + "azure-storage-blob", ], ) From d323e630e08beee28ee1f2ae9a9b4926f4579636 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 14 Feb 2024 10:48:42 -0800 Subject: [PATCH 03/20] update compile_to_vmfb and sort for download --- .../custom_models/sd_inference/clip.py | 4 +++- .../custom_models/sd_inference/unet.py | 4 +++- .../custom_models/sd_inference/utils.py | 9 +++++++-- .../turbine_models/custom_models/sd_inference/vae.py | 4 +++- models/turbine_models/turbine_tank/turbine_tank.py | 12 ++++++++++++ 5 files changed, 28 insertions(+), 5 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index a2ab030ef..860bf0a15 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -125,7 +125,9 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)): if compile_to != "vmfb": return module_str, tokenizer else: - utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name) + utils.compile_to_vmfb( + module_str, device, target_triple, max_alloc, safe_name, upload_ir + ) if __name__ == "__main__": diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index d193ded78..2b49a1792 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -155,7 +155,9 @@ def main( if compile_to != "vmfb": return module_str else: - utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name) + utils.compile_to_vmfb( + module_str, device, target_triple, max_alloc, safe_name, upload_ir + ) if __name__ == "__main__": diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index c4898dac7..14197dd3a 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -26,7 +26,9 @@ def largest_error(array1, array2): return max_error -def compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name): +def compile_to_vmfb( + module_str, device, target_triple, max_alloc, safe_name, upload_ir=False +): flags = [ "--iree-input-type=torch", "--mlir-print-debuginfo", @@ -79,7 +81,10 @@ def compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name): with open(f"{safe_name}.vmfb", "wb+") as f: f.write(flatbuffer_blob) print("Saved to", safe_name + ".vmfb") - return + if upload_ir: + return + else: + exit() def create_safe_name(hf_model_name, model_name_str): diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index 2aef05bcf..77a2ced3a 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -143,7 +143,9 @@ def main(self, 
inp=AbstractTensor(*sample, dtype=torch.float32)): if compile_to != "vmfb": return module_str else: - utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name) + utils.compile_to_vmfb( + module_str, device, target_triple, max_alloc, safe_name, upload_ir + ) if __name__ == "__main__": diff --git a/models/turbine_models/turbine_tank/turbine_tank.py b/models/turbine_models/turbine_tank/turbine_tank.py index 92a294e3e..3d06d8009 100644 --- a/models/turbine_models/turbine_tank/turbine_tank.py +++ b/models/turbine_models/turbine_tank/turbine_tank.py @@ -4,6 +4,7 @@ import datetime import os from pathlib import Path +from functools import cmp_to_key custom_path = os.getenv("TURBINE_TANK_CACHE_DIR") if custom_path is not None: @@ -108,6 +109,16 @@ def download_public_folder(model_name: str, prefix: str, model_dir: str): sample_blob.write(download_stream.readall()) +# sort blobs by last modified +def compare(item1, item2): + if item1.last_modified < item2.last_modified: + return -1 + elif item1.last_modified < item2.last_modified: + return 1 + else: + return 0 + + def downloadModelArtifacts(model_name: str) -> str: model_name = model_name.replace("/", "_") container_client = BlobServiceClient.from_connection_string( @@ -115,6 +126,7 @@ def downloadModelArtifacts(model_name: str) -> str: ).get_container_client(container=container_name) blob_list = container_client.list_blobs() # get the latest blob uploaded to turbine tank (can't use [] notation for blob_list) + blob_list = sorted(blob_list, key=cmp_to_key(compare)) for blob in blob_list: latest_blob = blob # get the prefix for the latest blob (2024-02-13_26d6428) From d584a2073ec5636518efd77fc89c0810c616e0f9 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 21 Feb 2024 02:05:45 -0800 Subject: [PATCH 04/20] update tank to add 30 models using general flow + leverage existing testing --- models/requirements.txt | 2 + models/setup.py | 1 + .../custom_models/sd_inference/clip.py | 13 - .../custom_models/sd_inference/unet.py | 12 - .../custom_models/sd_inference/utils.py | 5 +- .../custom_models/sd_inference/vae.py | 14 +- .../custom_models/stateless_llama.py | 13 - models/turbine_models/model_builder.py | 34 +- models/turbine_models/tests/sd_test.py | 8 + .../tests/stateless_llama_test.py | 3 + .../turbine_models/turbine_tank/run_models.py | 404 ------------------ .../turbine_models/turbine_tank/run_tank.py | 61 +++ .../turbine_models/turbine_tank/tank_test.py | 143 +++++++ .../turbine_models/turbine_tank/tank_util.py | 260 +++++++++++ .../turbine_tank/turbine_tank.py | 12 +- 15 files changed, 522 insertions(+), 463 deletions(-) delete mode 100644 models/turbine_models/turbine_tank/run_models.py create mode 100644 models/turbine_models/turbine_tank/run_tank.py create mode 100644 models/turbine_models/turbine_tank/tank_test.py create mode 100644 models/turbine_models/turbine_tank/tank_util.py diff --git a/models/requirements.txt b/models/requirements.txt index 99678eb68..132b52309 100644 --- a/models/requirements.txt +++ b/models/requirements.txt @@ -7,3 +7,5 @@ diffusers==0.24.0 brevitas @ git+https://github.com/Xilinx/brevitas.git@6695e8df7f6a2c7715b9ed69c4b78157376bb60b # turbine tank downloading/uploading azure-storage-blob +# microsoft/phi model +einops diff --git a/models/setup.py b/models/setup.py index 60ae10c4c..7c5dcfa97 100644 --- a/models/setup.py +++ b/models/setup.py @@ -62,5 +62,6 @@ def load_version_info(): "accelerate", "diffusers==0.24.0", "azure-storage-blob", + "einops", ], ) diff --git 
a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index 860bf0a15..d7ed96561 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -47,12 +47,6 @@ help="Specify vulkan target triple or rocm/cuda target device.", ) parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") -parser.add_argument( - "--download_ir", - action=argparse.BooleanOptionalAction, - default=True, - help="download IR from turbine tank", -) parser.add_argument( "--upload_ir", action=argparse.BooleanOptionalAction, @@ -70,7 +64,6 @@ def export_clip_model( device=None, target_triple=None, max_alloc=None, - download_ir=False, upload_ir=False, ): # Load the tokenizer and text encoder to tokenize and encode the text. @@ -80,9 +73,6 @@ def export_clip_model( token=hf_auth_token, ) - if download_ir: - return turbine_tank.downloadModelArtifacts(hf_model_name + "-clip"), tokenizer - text_encoder_model = CLIPTextModel.from_pretrained( hf_model_name, subfolder="text_encoder", @@ -132,8 +122,6 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)): if __name__ == "__main__": args = parser.parse_args() - if args.upload_ir and args.download_ir: - raise ValueError("upload_ir and download_ir can't both be true") mod_str, _ = export_clip_model( args.hf_model_name, args.hf_auth_token, @@ -143,7 +131,6 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)): args.device, args.iree_target_triple, args.vulkan_max_allocation, - args.download_ir, args.upload_ir, ) safe_name = args.hf_model_name.split("/")[-1].strip() diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index 2b49a1792..f65af556c 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -54,12 +54,6 @@ help="Specify vulkan target triple or rocm/cuda target device.", ) parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") -parser.add_argument( - "--download_ir", - action=argparse.BooleanOptionalAction, - default=True, - help="download IR from turbine tank", -) parser.add_argument( "--upload_ir", action=argparse.BooleanOptionalAction, @@ -103,11 +97,8 @@ def export_unet_model( device=None, target_triple=None, max_alloc=None, - download_ir=False, upload_ir=False, ): - if download_ir: - return turbine_tank.downloadModelArtifacts(hf_model_name + "-unet") mapper = {} utils.save_external_weights( @@ -162,8 +153,6 @@ def main( if __name__ == "__main__": args = parser.parse_args() - if args.upload_ir and args.download_ir: - raise ValueError("upload_ir and download_ir can't both be true") unet_model = UnetModel( args.hf_model_name, args.hf_auth_token, @@ -181,7 +170,6 @@ def main( args.device, args.iree_target_triple, args.vulkan_max_allocation, - args.download_ir, args.upload_ir, ) safe_name = utils.create_safe_name(args.hf_model_name, "-unet") diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 14197dd3a..3d5d2a0a2 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -81,10 +81,7 @@ def compile_to_vmfb( with open(f"{safe_name}.vmfb", "wb+") as f: f.write(flatbuffer_blob) print("Saved to", safe_name + ".vmfb") - if upload_ir: - return - else: - exit() + exit() 
def create_safe_name(hf_model_name, model_name_str): diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index 77a2ced3a..5d62edf1f 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -55,12 +55,6 @@ ) parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") parser.add_argument("--variant", type=str, default="decode") -parser.add_argument( - "--download_ir", - action=argparse.BooleanOptionalAction, - default=True, - help="download IR from turbine tank", -) parser.add_argument( "--upload_ir", action=argparse.BooleanOptionalAction, @@ -102,11 +96,8 @@ def export_vae_model( target_triple=None, max_alloc=None, variant="decode", - download_ir=False, upload_ir=False, ): - if download_ir: - return turbine_tank.downloadModelArtifacts(hf_model_name + "-" + variant) mapper = {} utils.save_external_weights( @@ -135,7 +126,7 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)): with open(f"{safe_name}.mlir", "w+") as f: f.write(module_str) model_name_upload = hf_model_name.replace("/", "_") - model_name_upload = model_name_upload + "-" + variant + model_name_upload = model_name_upload + "-vae-" + variant turbine_tank.uploadToBlobStorage( str(os.path.abspath(f"{safe_name}.mlir")), f"{model_name_upload}/{model_name_upload}.mlir", @@ -150,8 +141,6 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)): if __name__ == "__main__": args = parser.parse_args() - if args.upload_ir and args.download_ir: - raise ValueError("upload_ir and download_ir can't both be true") vae_model = VaeModel( args.hf_model_name, args.hf_auth_token, @@ -170,7 +159,6 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)): args.iree_target_triple, args.vulkan_max_allocation, args.variant, - args.download_ir, args.upload_ir, ) safe_name = utils.create_safe_name(args.hf_model_name, "-vae") diff --git a/models/turbine_models/custom_models/stateless_llama.py b/models/turbine_models/custom_models/stateless_llama.py index 5e4c7ca1a..3fa19c12e 100644 --- a/models/turbine_models/custom_models/stateless_llama.py +++ b/models/turbine_models/custom_models/stateless_llama.py @@ -62,12 +62,6 @@ action="store_true", help="Compile LLM with StreamingLLM optimizations", ) -parser.add_argument( - "--download_ir", - action=argparse.BooleanOptionalAction, - default=True, - help="download IR from turbine tank", -) parser.add_argument( "--upload_ir", action=argparse.BooleanOptionalAction, @@ -120,7 +114,6 @@ def export_transformer_model( vulkan_max_allocation=None, streaming_llm=False, vmfb_path=None, - download_ir=False, upload_ir=False, ): tokenizer = AutoTokenizer.from_pretrained( @@ -129,9 +122,6 @@ def export_transformer_model( token=hf_auth_token, ) - if download_ir: - return turbine_tank.downloadModelArtifacts(hf_model_name), tokenizer - mod = AutoModelForCausalLM.from_pretrained( hf_model_name, torch_dtype=torch.float, @@ -410,8 +400,6 @@ def evict_kvcache_space(self): if __name__ == "__main__": args = parser.parse_args() - if args.upload_ir and args.download_ir: - raise ValueError("upload_ir and download_ir can't both be true") mod_str, _ = export_transformer_model( args.hf_model_name, args.hf_auth_token, @@ -425,7 +413,6 @@ def evict_kvcache_space(self): args.vulkan_max_allocation, args.streaming_llm, args.vmfb_path, - args.download_ir, args.upload_ir, ) safe_name = args.hf_model_name.split("/")[-1].strip() diff --git 
a/models/turbine_models/model_builder.py b/models/turbine_models/model_builder.py index 22139ca64..6f5c8b578 100644 --- a/models/turbine_models/model_builder.py +++ b/models/turbine_models/model_builder.py @@ -1,6 +1,9 @@ from transformers import AutoModel, AutoTokenizer, AutoConfig import torch import shark_turbine.aot as aot +from turbine_models.turbine_tank import turbine_tank +import os +import re class HFTransformerBuilder: @@ -23,6 +26,10 @@ def __init__( auto_tokenizer: AutoTokenizer = None, auto_config: AutoConfig = None, hf_auth_token=None, + upload_ir=False, + model=None, + model_type: str = None, + run_e2e: bool = None, ) -> None: self.example_input = example_input self.hf_id = hf_id @@ -30,9 +37,13 @@ def __init__( self.auto_tokenizer = auto_tokenizer self.auto_config = auto_config self.hf_auth_token = hf_auth_token - self.model = None + self.model = model self.tokenizer = None - self.build_model() + self.upload_ir = upload_ir + self.model_type = model_type + self.run_e2e = run_e2e + if self.model == None: + self.build_model() def build_model(self) -> None: """ @@ -59,6 +70,23 @@ def get_compiled_module(self, save_to: str = None) -> aot.CompiledModule: Returns: aot.CompiledModule: The compiled module binary. """ - module = aot.export(self.model, self.example_input) + if self.model_type == "hf_seq2seq": + module = aot.export(self.model, *self.example_input) + else: + module = aot.export(self.model, self.example_input) + module_str = str(module.mlir_module) + safe_name = self.hf_id.split("/")[-1].strip() + safe_name = re.sub("-", "_", safe_name) + if self.upload_ir: + with open(f"{safe_name}.mlir", "w+") as f: + f.write(module_str) + model_name_upload = self.hf_id.replace("/", "_") + turbine_tank.uploadToBlobStorage( + str(os.path.abspath(f"{safe_name}.mlir")), + f"{model_name_upload}/{model_name_upload}.mlir", + ) + os.remove(f"{safe_name}.mlir") + if self.run_e2e is not None and self.run_e2e is False: + return compiled_binary = module.compile(save_to=save_to) return compiled_binary diff --git a/models/turbine_models/tests/sd_test.py b/models/turbine_models/tests/sd_test.py index 125f97d82..a00e4bb2b 100644 --- a/models/turbine_models/tests/sd_test.py +++ b/models/turbine_models/tests/sd_test.py @@ -55,6 +55,7 @@ class StableDiffusionTest(unittest.TestCase): def testExportClipModel(self): + upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload") with self.assertRaises(SystemExit) as cm: clip.export_clip_model( # This is a public model, so no auth required @@ -64,6 +65,7 @@ def testExportClipModel(self): "safetensors", "stable_diffusion_v1_4_clip.safetensors", "cpu", + upload_ir=upload_ir_var == "upload", ) self.assertEqual(cm.exception.code, None) arguments["external_weight_path"] = "stable_diffusion_v1_4_clip.safetensors" @@ -85,6 +87,7 @@ def testExportClipModel(self): os.remove("stable_diffusion_v1_4_clip.vmfb") def testExportUnetModel(self): + upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload") with self.assertRaises(SystemExit) as cm: unet.export_unet_model( unet_model, @@ -98,6 +101,7 @@ def testExportUnetModel(self): "safetensors", "stable_diffusion_v1_4_unet.safetensors", "cpu", + upload_ir=upload_ir_var == "upload", ) self.assertEqual(cm.exception.code, None) arguments["external_weight_path"] = "stable_diffusion_v1_4_unet.safetensors" @@ -135,6 +139,7 @@ def testExportUnetModel(self): os.remove("stable_diffusion_v1_4_unet.vmfb") def testExportVaeModelDecode(self): + upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload") with 
self.assertRaises(SystemExit) as cm: vae.export_vae_model( vae_model, @@ -149,6 +154,7 @@ def testExportVaeModelDecode(self): "stable_diffusion_v1_4_vae.safetensors", "cpu", variant="decode", + upload_ir=upload_ir_var == "upload", ) self.assertEqual(cm.exception.code, None) arguments["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors" @@ -180,6 +186,7 @@ def testExportVaeModelDecode(self): os.remove("stable_diffusion_v1_4_vae.vmfb") def testExportVaeModelEncode(self): + upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload") with self.assertRaises(SystemExit) as cm: vae.export_vae_model( vae_model, @@ -194,6 +201,7 @@ def testExportVaeModelEncode(self): "stable_diffusion_v1_4_vae.safetensors", "cpu", variant="encode", + upload_ir=upload_ir_var == "upload", ) self.assertEqual(cm.exception.code, None) arguments["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors" diff --git a/models/turbine_models/tests/stateless_llama_test.py b/models/turbine_models/tests/stateless_llama_test.py index 574902101..c72c55e55 100644 --- a/models/turbine_models/tests/stateless_llama_test.py +++ b/models/turbine_models/tests/stateless_llama_test.py @@ -53,6 +53,8 @@ def test_vmfb_comparison(self): For VMFB, quantization can be int4 or None, but right now only using none for compatibility with torch. """ + upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload") + llama.export_transformer_model( hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2", hf_auth_token=None, @@ -63,6 +65,7 @@ def test_vmfb_comparison(self): precision=precision, device="llvm-cpu", target_triple="host", + upload_ir=upload_ir_var == "upload", ) torch_str_cache_path = f"models/turbine_models/tests/vmfb_comparison_cached_torch_output_{precision}_{quantization}.txt" diff --git a/models/turbine_models/turbine_tank/run_models.py b/models/turbine_models/turbine_tank/run_models.py deleted file mode 100644 index 5d612c4ee..000000000 --- a/models/turbine_models/turbine_tank/run_models.py +++ /dev/null @@ -1,404 +0,0 @@ -# Copyright 2023 Nod Labs, Inc -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -import argparse -from turbine_models.custom_models.sd_inference import ( - clip, - clip_runner, - unet, - unet_runner, - vae, - vae_runner, -) - -from turbine_models.custom_models.sd_inference import utils -import torch -import os -import turbine_models.custom_models.stateless_llama as llama -import difflib -from turbine_models.turbine_tank import turbine_tank - -parser = argparse.ArgumentParser() -parser.add_argument( - "--download_ir", - action=argparse.BooleanOptionalAction, - default=False, - help="download IR from turbine tank", -) -parser.add_argument( - "--upload_ir", - action=argparse.BooleanOptionalAction, - default=True, - help="upload IR to turbine tank", -) - -os.environ["TORCH_LOGS"] = "dynamic" -from shark_turbine.aot import * -from turbine_models.custom_models import llm_runner - -from turbine_models.gen_external_params.gen_external_params import ( - gen_external_params, -) - -DEFAULT_PROMPT = """[INST] <> -Be concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <> hi what are you? 
[/INST] -""" - - -def check_output_string(reference, output): - # Calculate and print diff - diff = difflib.unified_diff( - reference.splitlines(keepends=True), - output.splitlines(keepends=True), - fromfile="reference", - tofile="output", - lineterm="", - ) - return "".join(diff) - - -def run_llama_model(download_ir=False, upload_ir=True): - if not download_ir: - gen_external_params( - hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2", - hf_auth_token=None, - ) - llama.export_transformer_model( - hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2", - hf_auth_token=None, - compile_to="vmfb", - external_weights="safetensors", - # external_weight_file="Llama-2-7b-chat-hf-function-calling-v2_f16_int4.safetensors", Do not export weights because this doesn't get quantized - quantization="int4", - precision="f16", - device="llvm-cpu", - target_triple="host", - download_ir=download_ir, - upload_ir=upload_ir, - ) - - if download_ir: - return - - torch_str_cache_path = ( - f"models/turbine_models/tests/vmfb_comparison_cached_torch_output_f16_int4.txt" - ) - # if cached, just read - if os.path.exists(torch_str_cache_path): - with open(torch_str_cache_path, "r") as f: - torch_str = f.read() - else: - torch_str = llm_runner.run_torch_llm( - "Trelis/Llama-2-7b-chat-hf-function-calling-v2", None, DEFAULT_PROMPT - ) - - with open(torch_str_cache_path, "w") as f: - f.write(torch_str) - - turbine_str = llm_runner.run_llm( - "local-task", - DEFAULT_PROMPT, - "Llama_2_7b_chat_hf_function_calling_v2.vmfb", - "Trelis/Llama-2-7b-chat-hf-function-calling-v2", - None, - f"Llama_2_7b_chat_hf_function_calling_v2_f16_int4.safetensors", - ) - - result = check_output_string(torch_str, turbine_str) - - # clean up - os.remove("Llama_2_7b_chat_hf_function_calling_v2_f16_int4.safetensors") - os.remove("Llama_2_7b_chat_hf_function_calling_v2.vmfb") - os.remove("Llama_2_7b_chat_hf_function_calling_v2.mlir") - - return result - - -arguments = { - "hf_auth_token": None, - "hf_model_name": "CompVis/stable-diffusion-v1-4", - "batch_size": 1, - "height": 512, - "width": 512, - "run_vmfb": True, - "compile_to": None, - "external_weight_path": "", - "vmfb_path": "", - "external_weights": None, - "device": "local-task", - "iree_target_triple": "", - "vulkan_max_allocation": "4294967296", - "prompt": "a photograph of an astronaut riding a horse", - "in_channels": 4, -} - - -unet_model = unet.UnetModel( - # This is a public model, so no auth required - "CompVis/stable-diffusion-v1-4", - None, -) - -vae_model = vae.VaeModel( - # This is a public model, so no auth required - "CompVis/stable-diffusion-v1-4", - None, -) - - -def run_clip_model(download_ir=False, upload_ir=True): - clip.export_clip_model( - # This is a public model, so no auth required - "CompVis/stable-diffusion-v1-4", - None, - "vmfb", - "safetensors", - "stable_diffusion_v1_4_clip.safetensors", - "cpu", - download_ir=download_ir, - upload_ir=upload_ir, - ) - - if download_ir: - return - - arguments["external_weight_path"] = "stable_diffusion_v1_4_clip.safetensors" - arguments["vmfb_path"] = "stable_diffusion_v1_4_clip.vmfb" - turbine = clip_runner.run_clip( - arguments["device"], - arguments["prompt"], - arguments["vmfb_path"], - arguments["hf_model_name"], - arguments["hf_auth_token"], - arguments["external_weight_path"], - ) - torch_output = clip_runner.run_torch_clip( - arguments["hf_model_name"], arguments["hf_auth_token"], arguments["prompt"] - ) - err = utils.largest_error(torch_output, turbine[0]) - if err < 9e-5: - result = "CLIP SUCCESS: 
" + str(err) - else: - result = "CLIP FAILURE: " + str(err) - - # clean up - os.remove("stable_diffusion_v1_4_clip.safetensors") - os.remove("stable_diffusion_v1_4_clip.vmfb") - os.remove("stable_diffusion_v1_4_clip.mlir") - - return result - - -def run_unet_model(download_ir=False, upload_ir=True): - unet.export_unet_model( - unet_model, - # This is a public model, so no auth required - "CompVis/stable-diffusion-v1-4", - arguments["batch_size"], - arguments["height"], - arguments["width"], - None, - "vmfb", - "safetensors", - "stable_diffusion_v1_4_unet.safetensors", - "cpu", - download_ir=download_ir, - upload_ir=upload_ir, - ) - - if download_ir: - return - - arguments["external_weight_path"] = "stable_diffusion_v1_4_unet.safetensors" - arguments["vmfb_path"] = "stable_diffusion_v1_4_unet.vmfb" - sample = torch.rand( - arguments["batch_size"], - arguments["in_channels"], - arguments["height"] // 8, - arguments["width"] // 8, - dtype=torch.float32, - ) - timestep = torch.zeros(1, dtype=torch.float32) - encoder_hidden_states = torch.rand(2, 77, 768, dtype=torch.float32) - - turbine = unet_runner.run_unet( - arguments["device"], - sample, - timestep, - encoder_hidden_states, - arguments["vmfb_path"], - arguments["hf_model_name"], - arguments["hf_auth_token"], - arguments["external_weight_path"], - ) - torch_output = unet_runner.run_torch_unet( - arguments["hf_model_name"], - arguments["hf_auth_token"], - sample, - timestep, - encoder_hidden_states, - ) - err = utils.largest_error(torch_output, turbine) - if err < 9e-5: - result = "UNET SUCCESS: " + str(err) - else: - result = "UNET FAILURE: " + str(err) - - # clean up - os.remove("stable_diffusion_v1_4_unet.safetensors") - os.remove("stable_diffusion_v1_4_unet.vmfb") - os.remove("stable_diffusion_v1_4_unet.mlir") - - return result - - -def run_vae_decode(download_ir=False, upload_ir=True): - vae.export_vae_model( - vae_model, - # This is a public model, so no auth required - "CompVis/stable-diffusion-v1-4", - arguments["batch_size"], - arguments["height"], - arguments["width"], - None, - "vmfb", - "safetensors", - "stable_diffusion_v1_4_vae.safetensors", - "cpu", - variant="decode", - download_ir=download_ir, - upload_ir=upload_ir, - ) - - if download_ir: - return - - arguments["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors" - arguments["vmfb_path"] = "stable_diffusion_v1_4_vae.vmfb" - example_input = torch.rand( - arguments["batch_size"], - 4, - arguments["height"] // 8, - arguments["width"] // 8, - dtype=torch.float32, - ) - turbine = vae_runner.run_vae( - arguments["device"], - example_input, - arguments["vmfb_path"], - arguments["hf_model_name"], - arguments["hf_auth_token"], - arguments["external_weight_path"], - ) - torch_output = vae_runner.run_torch_vae( - arguments["hf_model_name"], - arguments["hf_auth_token"], - "decode", - example_input, - ) - err = utils.largest_error(torch_output, turbine) - if err < 9e-5: - result = "VAE DECODE SUCCESS: " + str(err) - else: - result = "VAE DECODE FAILURE: " + str(err) - - # clean up - os.remove("stable_diffusion_v1_4_vae.safetensors") - os.remove("stable_diffusion_v1_4_vae.vmfb") - os.remove("stable_diffusion_v1_4_vae.mlir") - - return result - - -def run_vae_encode(download_ir=False, upload_ir=True): - vae.export_vae_model( - vae_model, - # This is a public model, so no auth required - "CompVis/stable-diffusion-v1-4", - arguments["batch_size"], - arguments["height"], - arguments["width"], - None, - "vmfb", - "safetensors", - "stable_diffusion_v1_4_vae.safetensors", - 
"cpu", - variant="encode", - download_ir=download_ir, - upload_ir=upload_ir, - ) - - if download_ir: - return - - arguments["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors" - arguments["vmfb_path"] = "stable_diffusion_v1_4_vae.vmfb" - example_input = torch.rand( - arguments["batch_size"], - 3, - arguments["height"], - arguments["width"], - dtype=torch.float32, - ) - turbine = vae_runner.run_vae( - arguments["device"], - example_input, - arguments["vmfb_path"], - arguments["hf_model_name"], - arguments["hf_auth_token"], - arguments["external_weight_path"], - ) - torch_output = vae_runner.run_torch_vae( - arguments["hf_model_name"], - arguments["hf_auth_token"], - "encode", - example_input, - ) - err = utils.largest_error(torch_output, turbine) - if err < 2e-3: - result = "VAE ENCODE SUCCESS: " + str(err) - else: - result = "VAE ENCODE FAILURE: " + str(err) - - # clean up - os.remove("stable_diffusion_v1_4_vae.safetensors") - os.remove("stable_diffusion_v1_4_vae.vmfb") - os.remove("stable_diffusion_v1_4_vae.mlir") - - return result - - -if __name__ == "__main__": - args = parser.parse_args() - - if args.upload_ir and args.download_ir: - raise ValueError("upload_ir and download_ir can't both be true") - - if args.upload_ir: - result = "Turbine Tank Results\n" - llama_result = run_llama_model(args.download_ir, args.upload_ir) - result += llama_result + "\n" - clip_result = run_clip_model(args.download_ir, args.upload_ir) - result += clip_result + "\n" - unet_result = run_unet_model(args.download_ir, args.upload_ir) - result += unet_result + "\n" - vae_decode_result = run_vae_decode(args.download_ir, args.upload_ir) - result += vae_decode_result + "\n" - vae_encode_result = run_vae_encode(args.download_ir, args.upload_ir) - result += vae_encode_result + "\n" - f = open("daily_report.txt", "a") - f.write(result) - f.close() - turbine_tank.uploadToBlobStorage( - str(os.path.abspath("daily_report.txt")), "daily_report.txt" - ) - os.remove("daily_report.txt") - else: - run_llama_model(args.download_ir, args.upload_ir) - run_clip_model(args.download_ir, args.upload_ir) - run_unet_model(args.download_ir, args.upload_ir) - run_vae_decode(args.download_ir, args.upload_ir) - run_vae_encode(args.download_ir, args.upload_ir) diff --git a/models/turbine_models/turbine_tank/run_tank.py b/models/turbine_models/turbine_tank/run_tank.py new file mode 100644 index 000000000..816d92322 --- /dev/null +++ b/models/turbine_models/turbine_tank/run_tank.py @@ -0,0 +1,61 @@ +# Copyright 2023 Nod Labs, Inc +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import argparse +import unittest +from turbine_models.turbine_tank import tank_util + +import turbine_models.tests.sd_test as sd_test +import os +from turbine_models.turbine_tank import turbine_tank + +import pytest + +parser = argparse.ArgumentParser() +parser.add_argument( + "--download_ir", + action=argparse.BooleanOptionalAction, + default=False, + help="download IR from turbine tank", +) + +if __name__ == "__main__": + args = parser.parse_args() + + if args.download_ir: + turbine_tank.downloadModelArtifacts("CompVis/stable-diffusion-v1-4-clip") + turbine_tank.downloadModelArtifacts("CompVis/stable-diffusion-v1-4-vae-decode") + turbine_tank.downloadModelArtifacts("CompVis/stable-diffusion-v1-4-vae-encode") + turbine_tank.downloadModelArtifacts("CompVis/stable-diffusion-v1-4-unet") + turbine_tank.downloadModelArtifacts( + "Trelis/Llama-2-7b-chat-hf-function-calling-v2" + ) + for model_name, _ in tank_util.model_list: + turbine_tank.downloadModelArtifacts(model_name) + else: + import turbine_models.tests.stateless_llama_test as stateless_llama_test + + # environment variable used to let the llama/sd tests know we are running from tank and want to upload + os.environ["TURBINE_TANK_ACTION"] = "upload" + + # run existing turbine llama and sd tests integrated with turbine tank + llama_suite = unittest.TestLoader().loadTestsFromModule(stateless_llama_test) + unittest.TextTestRunner(verbosity=2).run(llama_suite) + + sd_suite = unittest.TestLoader().loadTestsFromModule(sd_test) + unittest.TextTestRunner(verbosity=2).run(sd_suite) + + # cleanup + os.remove("Llama_2_7b_chat_hf_function_calling_v2_f32_unquantized.safetensors") + os.remove("Llama_2_7b_chat_hf_function_calling_v2.mlir") + os.remove("Llama_2_7b_chat_hf_function_calling_v2.vmfb") + os.remove("streaming_llama.vmfb") + os.remove("stable_diffusion_v1_4_clip.mlir") + os.remove("stable_diffusion_v1_4_unet.mlir") + os.remove("stable_diffusion_v1_4_vae.mlir") + + # runs tank_test.py (only pytest file in this directory, runs 30 models e2e) + pytest.main(["-v", os.path.dirname(os.path.abspath(__file__))]) diff --git a/models/turbine_models/turbine_tank/tank_test.py b/models/turbine_models/turbine_tank/tank_test.py new file mode 100644 index 000000000..17c8ec8ba --- /dev/null +++ b/models/turbine_models/turbine_tank/tank_test.py @@ -0,0 +1,143 @@ +import pytest +from turbine_models.turbine_tank import tank_util +from turbine_models.model_builder import HFTransformerBuilder +from turbine_models.model_runner import vmfbRunner +from turbine_models.custom_models.sd_inference import utils +from iree import runtime as ireert +import os + + +@pytest.mark.parametrize( + "model_name,model_type,expected_err,run_e2e", + [ + ("microsoft/resnet-50", "hf_img_cls", 8e-05, True), + ("bert-large-uncased", "hf", 8e-06, True), + ("facebook/deit-small-distilled-patch16-224", "hf_img_cls", 8e-05, True), + ("google/vit-base-patch16-224", "hf_img_cls", 8e-05, True), + ("microsoft/beit-base-patch16-224-pt22k-ft22k", "hf_img_cls", 8e-05, True), + ("microsoft/MiniLM-L12-H384-uncased", "hf", 5e-07, True), + ("google/mobilebert-uncased", "hf", 4.3, True), + ("mobilenet_v3_small", "vision", 6e-05, True), + ("nvidia/mit-b0", "hf_img_cls", 7.3, True), + ("resnet101", "vision", 8e-06, True), + ("resnet18", "vision", 8e-06, True), + ("resnet50", "vision", 8e-06, True), + ("squeezenet1_0", "vision", 9e-06, True), + ("wide_resnet50_2", "vision", 9e-06, True), + ("mnasnet1_0", "vision", 2e-05, True), + pytest.param( + 
"t5-base", + "hf_seq2seq", + -1, + False, + marks=pytest.mark.xfail(reason="iree-compile fails"), + ), + pytest.param( + "t5-large", + "hf_seq2seq", + -1, + False, + marks=pytest.mark.xfail(reason="iree-compile fails"), + ), + ("openai/whisper-base", "hf_causallm", 9e-05, True), + ("openai/whisper-small", "hf_causallm", 0.0003, True), + ("openai/whisper-medium", "hf_causallm", 0.0003, True), + ("facebook/opt-350m", "hf", 9e-07, True), + ("facebook/opt-1.3b", "hf", 9e-06, True), + ("BAAI/bge-base-en-v1.5", "hf", 9e-07, True), + pytest.param( + "facebook/bart-large", + "hf_seq2seq", + -1, + False, + marks=pytest.mark.xfail(reason="iree-compile fails"), + ), + pytest.param( + "gpt2", + "hf", + -1, + False, + marks=pytest.mark.xfail(reason="iree-compile fails"), + ), + pytest.param( + "gpt2-xl", + "hf", + -1, + False, + marks=pytest.mark.xfail(reason="iree-compile fails"), + ), + ("lmsys/vicuna-13b-v1.3", "hf", 5e-05, True), + pytest.param( + "microsoft/phi-1_5", + "hf_causallm", + -1, + True, + marks=pytest.mark.xfail(reason="correctness issue"), + ), # nan error reported (correctness issue) + pytest.param( + "microsoft/phi-2", + "hf_causallm", + -1, + True, + marks=pytest.mark.xfail(reason="correctness issue"), + ), # nan error reported (correctness issue) + pytest.param( + "mosaicml/mpt-30b", + "hf_causallm", + -1, + False, + marks=pytest.mark.xfail(reason="iree-compile fails"), + ), + ("stabilityai/stablelm-3b-4e1t", "hf_causallm", 0.0004, True), + ], +) +def test_all_models(model_name, model_type, expected_err, run_e2e): + import_args = { + "batch_size": 1, + } + # Based on the model type, get the appropriate hugging face model, inputs, and output + if model_type == "vision": + torch_model, input, out = tank_util.get_vision_model(model_name, import_args) + elif model_type == "hf": + torch_model, input, out = tank_util.get_hf_model(model_name, import_args) + elif model_type == "hf_seq2seq": + torch_model, input, out = tank_util.get_hf_seq2seq_model( + model_name, import_args + ) + elif model_type == "hf_causallm": + torch_model, input, out = tank_util.get_hf_causallm_model( + model_name, import_args + ) + elif model_type == "hf_img_cls": + torch_model, input, out = tank_util.get_hf_img_cls_model( + model_name, import_args + ) + # compile model and get vmfb + model = HFTransformerBuilder( + example_input=input, + hf_id=model_name, + hf_auth_token="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", + upload_ir=True, + model=torch_model, + model_type=model_type, + run_e2e=run_e2e, + ) + vmfb_name = model_name.replace("/", "_") + ".vmfb" + model.get_compiled_module(save_to=vmfb_name) + + # if model is not supposed to run e2e, exit at this point (mlir has been uploaded) + if run_e2e is False: + assert expected_err > 0 + return + + # run inference using iree runtime + runner = vmfbRunner("local-task", vmfb_name) + inputs = [ireert.asdevicearray(runner.config.device, input)] + keys = list(runner.ctx.modules) + key = keys[len(keys) - 1] + results = runner.ctx.modules.__getattr__(key)["main"](*inputs) + err = utils.largest_error(out.cpu().detach().numpy(), results) + # cleanup + os.remove(vmfb_name) + # accuracy + assert err < expected_err diff --git a/models/turbine_models/turbine_tank/tank_util.py b/models/turbine_models/turbine_tank/tank_util.py new file mode 100644 index 000000000..4832e9e24 --- /dev/null +++ b/models/turbine_models/turbine_tank/tank_util.py @@ -0,0 +1,260 @@ +import torch +import numpy as np + +torch.manual_seed(0) + +BATCH_SIZE = 1 + +model_list = [ + ("microsoft/resnet-50", 
"hf_img_cls"), + ("bert-large-uncased", "hf"), + ("facebook/deit-small-distilled-patch16-224", "hf_img_cls"), + ("google/vit-base-patch16-224", "hf_img_cls"), + ("microsoft/beit-base-patch16-224-pt22k-ft22k", "hf_img_cls"), + ("microsoft/MiniLM-L12-H384-uncased", "hf"), + ("google/mobilebert-uncased", "hf"), + ("mobilenet_v3_small", "vision"), + ("nvidia/mit-b0", "hf_img_cls"), + ("resnet101", "vision"), + ("resnet18", "vision"), + ("resnet50", "vision"), + ("squeezenet1_0", "vision"), + ("wide_resnet50_2", "vision"), + ("mnasnet1_0", "vision"), + ("t5-base", "hf_seq2seq"), # iree-compile failure + ("t5-large", "hf_seq2seq"), # iree-compile failure + ("openai/whisper-base", "hf_causallm"), + ("openai/whisper-small", "hf_causallm"), + ("openai/whisper-medium", "hf_causallm"), + ("facebook/opt-350m", "hf"), + ("facebook/opt-1.3b", "hf"), + ("BAAI/bge-base-en-v1.5", "hf"), + ("facebook/bart-large", "hf_seq2seq"), # iree-compile fails + ("gpt2", "hf"), # iree-compile fails + ("gpt2-xl", "hf"), # iree-compile fails + ("lmsys/vicuna-13b-v1.3", "hf"), + ("microsoft/phi-1_5", "hf_causallm"), # nan error reported (correctness issue) + ("microsoft/phi-2", "hf_causallm"), # nan error reported (correctness issue) + ("mosaicml/mpt-30b", "hf_causallm"), # iree-compile fails + ("stabilityai/stablelm-3b-4e1t", "hf_causallm"), +] + + +##################### Hugging Face Image Classification Models ################################### +from transformers import AutoModelForImageClassification +from transformers import AutoFeatureExtractor +from PIL import Image +import requests + + +def preprocess_input_image(model_name): + # from datasets import load_dataset + # dataset = load_dataset("huggingface/cats-image") + # image1 = dataset["test"]["image"][0] + # # print("image1: ", image1) # + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + # + image = Image.open(requests.get(url, stream=True).raw) + # feature_extractor = img_models_fe_dict[model_name].from_pretrained( + # model_name + # ) + feature_extractor = AutoFeatureExtractor.from_pretrained(model_name) + inputs = feature_extractor(images=image, return_tensors="pt") + # inputs = {'pixel_values': tensor([[[[ 0.1137..., -0.2000, -0.4275, -0.5294]]]])} + # torch.Size([1, 3, 224, 224]), torch.FloatTensor + + return inputs[str(*inputs)] + + +class HuggingFaceImageClassification(torch.nn.Module): + def __init__(self, hf_model_name): + super().__init__() + self.model = AutoModelForImageClassification.from_pretrained( + hf_model_name, # The pretrained model. + output_attentions=False, # Whether the model returns attentions weights. + return_dict=False, # https://github.com/huggingface/transformers/issues/9095 + torchscript=True, + ) + + def forward(self, inputs): + return self.model.forward(inputs)[0] + + +def get_hf_img_cls_model(name, import_args): + model = HuggingFaceImageClassification(name) + # you can use preprocess_input_image to get the test_input or just random value. 
+ test_input = preprocess_input_image(name) + # test_input = torch.FloatTensor(1, 3, 224, 224).uniform_(-1, 1) + # print("test_input.shape: ", test_input.shape) + # test_input.shape: torch.Size([1, 3, 224, 224]) + test_input = test_input.repeat(int(import_args["batch_size"]), 1, 1, 1) + actual_out = model(test_input) + # actual_out.shape: torch.Size([1, 1000]) + return model, test_input, actual_out + + +##################### Hugging Face LM Models ################################### + + +class HuggingFaceLanguage(torch.nn.Module): + def __init__(self, hf_model_name): + super().__init__() + from transformers import AutoModelForSequenceClassification, AutoTokenizer + import transformers as trf + + transformers_path = trf.__path__[0] + hf_model_path = f"{transformers_path}/models/{hf_model_name}" + self.model = AutoModelForSequenceClassification.from_pretrained( + hf_model_name, # The pretrained model. + num_labels=2, # The number of output labels--2 for binary classification. + output_attentions=False, # Whether the model returns attentions weights. + output_hidden_states=False, # Whether the model returns all hidden-states. + torchscript=True, + ) + self.model.config.pad_token_id = None + + def forward(self, tokens): + return self.model.forward(tokens)[0] + + +def get_hf_model(name, import_args): + from transformers import ( + BertTokenizer, + ) + + model = HuggingFaceLanguage(name) + test_input = torch.randint(2, (int(import_args["batch_size"]), 128)) + actual_out = model(test_input) + return model, test_input, actual_out + + +##################### Hugging Face Seq2SeqLM Models ################################### + +# We use a maximum sequence length of 512 since this is the default used in the T5 config. +T5_MAX_SEQUENCE_LENGTH = 512 + + +class HFSeq2SeqLanguageModel(torch.nn.Module): + def __init__(self, model_name): + super().__init__() + from transformers import AutoTokenizer, T5Model + + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.tokenization_kwargs = { + "pad_to_multiple_of": T5_MAX_SEQUENCE_LENGTH, + "padding": True, + "return_tensors": "pt", + } + self.model = T5Model.from_pretrained(model_name, return_dict=True) + + def preprocess_input(self, text): + return self.tokenizer(text, **self.tokenization_kwargs) + + def forward(self, input_ids, decoder_input_ids): + return self.model.forward(input_ids, decoder_input_ids=decoder_input_ids)[0] + + +def get_hf_seq2seq_model(name, import_args): + m = HFSeq2SeqLanguageModel(name) + encoded_input_ids = m.preprocess_input( + "Studies have been shown that owning a dog is good for you" + ).input_ids + decoder_input_ids = m.preprocess_input("Studies show that").input_ids + decoder_input_ids = m.model._shift_right(decoder_input_ids) + + test_input = (encoded_input_ids, decoder_input_ids) + actual_out = m.forward(*test_input) + return m, test_input, actual_out + + +##################### Hugging Face CausalLM Models ################################### +from transformers import AutoTokenizer, AutoModelForCausalLM + + +def prepare_sentence_tokens(hf_model: str, sentence: str): + tokenizer = AutoTokenizer.from_pretrained(hf_model) + return torch.tensor([tokenizer.encode(sentence)]) + + +class HFCausalLM(torch.nn.Module): + def __init__(self, model_name: str): + super().__init__() + self.model = AutoModelForCausalLM.from_pretrained( + model_name, # The pretrained model name. + # The number of output labels--2 for binary classification. + num_labels=2, + # Whether the model returns attentions weights. 
+            output_attentions=False,
+            # Whether the model returns all hidden-states.
+            output_hidden_states=False,
+            torchscript=True,
+            trust_remote_code=True,
+        )
+        self.model.eval()
+
+    def forward(self, tokens):
+        return self.model.forward(tokens)[0]
+
+
+def get_hf_causallm_model(name, import_args):
+    m = HFCausalLM(name)
+    test_input = prepare_sentence_tokens(name, "this project is very interesting")
+    actual_out = m.forward(test_input)
+    return m, test_input, actual_out
+
+
+################################################################################
+
+##################### Torch Vision Models ###################################
+
+
+class VisionModule(torch.nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+        self.train(False)
+
+    def forward(self, input):
+        return self.model.forward(input)
+
+
+def get_vision_model(torch_model, import_args):
+    import torchvision.models as models
+
+    default_image_size = (224, 224)
+    modelname = torch_model
+    if modelname == "alexnet":
+        torch_model = models.alexnet(weights="DEFAULT")
+        input_image_size = default_image_size
+    if modelname == "resnet18":
+        torch_model = models.resnet18(weights="DEFAULT")
+        input_image_size = default_image_size
+    if modelname == "resnet50":
+        torch_model = models.resnet50(weights="DEFAULT")
+        input_image_size = default_image_size
+    if modelname == "resnet50_fp16":
+        torch_model = models.resnet50(weights="DEFAULT")
+        input_image_size = default_image_size
+    if modelname == "resnet101":
+        torch_model = models.resnet101(weights="DEFAULT")
+        input_image_size = default_image_size
+    if modelname == "squeezenet1_0":
+        torch_model = models.squeezenet1_0(weights="DEFAULT")
+        input_image_size = default_image_size
+    if modelname == "wide_resnet50_2":
+        torch_model = models.wide_resnet50_2(weights="DEFAULT")
+        input_image_size = default_image_size
+    if modelname == "mobilenet_v3_small":
+        torch_model = models.mobilenet_v3_small(weights="DEFAULT")
+        input_image_size = default_image_size
+    if modelname == "mnasnet1_0":
+        torch_model = models.mnasnet1_0(weights="DEFAULT")
+        input_image_size = default_image_size
+
+    model = VisionModule(torch_model)
+    test_input = torch.randn(int(import_args["batch_size"]), 3, *input_image_size)
+    actual_out = model(test_input)
+    return model, test_input, actual_out
diff --git a/models/turbine_models/turbine_tank/turbine_tank.py b/models/turbine_models/turbine_tank/turbine_tank.py
index 3d06d8009..708218c75 100644
--- a/models/turbine_models/turbine_tank/turbine_tank.py
+++ b/models/turbine_models/turbine_tank/turbine_tank.py
@@ -90,10 +90,12 @@ def download_public_folder(model_name: str, prefix: str, model_dir: str):
         container=container_name
     )
     blob_list = container_client.list_blobs(name_starts_with=prefix)
+    empty = True

     # go through the blobs with our target prefix
     # example prefix: "2024-02-13_26d6428/CompVis_stable-diffusion-v1-4-clip"
     for blob in blob_list:
+        empty = False
         blob_client = blob_service_client.get_blob_client(
             container=container_name, blob=blob.name
         )
@@ -108,6 +110,12 @@ def download_public_folder(model_name: str, prefix: str, model_dir: str):
             download_stream = blob_client.download_blob()
             sample_blob.write(download_stream.readall())

+    if empty:
+        print(f"Model ({model_name}) has not been uploaded yet")
+        return True
+
+    return False
+

 # sort blobs by last modified
 def compare(item1, item2):
@@ -140,11 +148,13 @@ def 
downloadModelArtifacts(model_name: str) -> str: print("Already downloaded most recent version") return "NA" # download the model artifacts (passing in the model name, path in azure storage to model artifacts, local directory to store) - download_public_folder( + blobDNE = download_public_folder( model_name, download_latest_prefix + "/" + model_name, os.path.join(model_dir, download_latest_prefix), ) + if blobDNE: + return model_dir = os.path.join(WORKDIR, model_name + "/" + download_latest_prefix) mlir_filename = os.path.join(model_dir, model_name + ".mlir") print( From b1ad5726f8df9094dc487309b1d8186f2872fcd0 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 21 Feb 2024 02:10:49 -0800 Subject: [PATCH 05/20] formatting --- models/turbine_models/custom_models/sd_inference/unet.py | 1 - models/turbine_models/custom_models/sd_inference/vae.py | 1 - 2 files changed, 2 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index f65af556c..829a8c0bc 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -99,7 +99,6 @@ def export_unet_model( max_alloc=None, upload_ir=False, ): - mapper = {} utils.save_external_weights( mapper, unet_model, external_weights, external_weight_path diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index 5d62edf1f..885ac6e60 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -98,7 +98,6 @@ def export_vae_model( variant="decode", upload_ir=False, ): - mapper = {} utils.save_external_weights( mapper, vae_model, external_weights, external_weight_path From a465a483e2b0888c7065498d7800abe25ebd021f Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 21 Feb 2024 02:56:02 -0800 Subject: [PATCH 06/20] remove unnecessary upload_ir var pass in utils --- models/turbine_models/custom_models/sd_inference/clip.py | 4 +--- models/turbine_models/custom_models/sd_inference/unet.py | 4 +--- models/turbine_models/custom_models/sd_inference/utils.py | 4 +--- models/turbine_models/custom_models/sd_inference/vae.py | 4 +--- 4 files changed, 4 insertions(+), 12 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index d7ed96561..2c09b13fb 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -115,9 +115,7 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)): if compile_to != "vmfb": return module_str, tokenizer else: - utils.compile_to_vmfb( - module_str, device, target_triple, max_alloc, safe_name, upload_ir - ) + utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name) if __name__ == "__main__": diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index 829a8c0bc..3a42dd918 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -145,9 +145,7 @@ def main( if compile_to != "vmfb": return module_str else: - utils.compile_to_vmfb( - module_str, device, target_triple, max_alloc, safe_name, upload_ir - ) + utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name) if __name__ == "__main__": diff 
--git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 3d5d2a0a2..37787fd3a 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -26,9 +26,7 @@ def largest_error(array1, array2): return max_error -def compile_to_vmfb( - module_str, device, target_triple, max_alloc, safe_name, upload_ir=False -): +def compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name): flags = [ "--iree-input-type=torch", "--mlir-print-debuginfo", diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index 885ac6e60..8ba0fb6bb 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -133,9 +133,7 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)): if compile_to != "vmfb": return module_str else: - utils.compile_to_vmfb( - module_str, device, target_triple, max_alloc, safe_name, upload_ir - ) + utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name) if __name__ == "__main__": From ac8997658becb8ade883938d73ce3ef34600b108 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 21 Feb 2024 12:31:04 -0800 Subject: [PATCH 07/20] address comments --- models/turbine_models/custom_models/sd_inference/clip.py | 7 ------- models/turbine_models/custom_models/sd_inference/unet.py | 7 ------- models/turbine_models/custom_models/sd_inference/vae.py | 7 ------- models/turbine_models/custom_models/stateless_llama.py | 7 ------- models/turbine_models/turbine_tank/turbine_tank.py | 9 ++++----- 5 files changed, 4 insertions(+), 33 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index 2c09b13fb..4cc5f91dd 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -47,12 +47,6 @@ help="Specify vulkan target triple or rocm/cuda target device.", ) parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") -parser.add_argument( - "--upload_ir", - action=argparse.BooleanOptionalAction, - default=False, - help="upload IR to turbine tank", -) def export_clip_model( @@ -129,7 +123,6 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)): args.device, args.iree_target_triple, args.vulkan_max_allocation, - args.upload_ir, ) safe_name = args.hf_model_name.split("/")[-1].strip() safe_name = re.sub("-", "_", safe_name) diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index 3a42dd918..2c1556e84 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -54,12 +54,6 @@ help="Specify vulkan target triple or rocm/cuda target device.", ) parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") -parser.add_argument( - "--upload_ir", - action=argparse.BooleanOptionalAction, - default=False, - help="upload IR to turbine tank", -) class UnetModel(torch.nn.Module): @@ -167,7 +161,6 @@ def main( args.device, args.iree_target_triple, args.vulkan_max_allocation, - args.upload_ir, ) safe_name = utils.create_safe_name(args.hf_model_name, "-unet") with open(f"{safe_name}.mlir", "w+") as f: diff --git 
a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index 8ba0fb6bb..fcf9453b4 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -55,12 +55,6 @@ ) parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") parser.add_argument("--variant", type=str, default="decode") -parser.add_argument( - "--upload_ir", - action=argparse.BooleanOptionalAction, - default=False, - help="upload IR to turbine tank", -) class VaeModel(torch.nn.Module): @@ -156,7 +150,6 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)): args.iree_target_triple, args.vulkan_max_allocation, args.variant, - args.upload_ir, ) safe_name = utils.create_safe_name(args.hf_model_name, "-vae") with open(f"{safe_name}.mlir", "w+") as f: diff --git a/models/turbine_models/custom_models/stateless_llama.py b/models/turbine_models/custom_models/stateless_llama.py index 3fa19c12e..6863fd5c2 100644 --- a/models/turbine_models/custom_models/stateless_llama.py +++ b/models/turbine_models/custom_models/stateless_llama.py @@ -62,12 +62,6 @@ action="store_true", help="Compile LLM with StreamingLLM optimizations", ) -parser.add_argument( - "--upload_ir", - action=argparse.BooleanOptionalAction, - default=False, - help="upload IR to turbine tank", -) def generate_schema(num_layers): @@ -413,7 +407,6 @@ def evict_kvcache_space(self): args.vulkan_max_allocation, args.streaming_llm, args.vmfb_path, - args.upload_ir, ) safe_name = args.hf_model_name.split("/")[-1].strip() safe_name = re.sub("-", "_", safe_name) diff --git a/models/turbine_models/turbine_tank/turbine_tank.py b/models/turbine_models/turbine_tank/turbine_tank.py index 708218c75..e5947cd58 100644 --- a/models/turbine_models/turbine_tank/turbine_tank.py +++ b/models/turbine_models/turbine_tank/turbine_tank.py @@ -19,12 +19,11 @@ print( f"turbine_tank local cache is located at {WORKDIR} . You may change this by assigning the TURBINE_TANK_CACHE_DIR environment variable." ) -os.makedirs(WORKDIR, exist_ok=True) -storage_account_key = "XSsr+KqxBLxXzRtFv3QbbdsAxdwDGe661Q1xY4ziMRtpCazN8W6HZePi6nwud5RNLC5Y7e410abg+AStyzmX1A==" -storage_account_name = "tankturbine" -connection_string = "DefaultEndpointsProtocol=https;AccountName=tankturbine;AccountKey=XSsr+KqxBLxXzRtFv3QbbdsAxdwDGe661Q1xY4ziMRtpCazN8W6HZePi6nwud5RNLC5Y7e410abg+AStyzmX1A==;EndpointSuffix=core.windows.net" -container_name = "tankturbine" +storage_account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY") +storage_account_name = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME") +connection_string = os.environ.get("AZURE_CONNECTION_STRING") +container_name = os.environ.get("AZURE_CONTAINER_NAME") def get_short_git_sha() -> str: From cde33d945a216b89e5df358cb13677239ea97383 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 21 Feb 2024 12:32:04 -0800 Subject: [PATCH 08/20] add line back --- models/turbine_models/turbine_tank/turbine_tank.py | 1 + 1 file changed, 1 insertion(+) diff --git a/models/turbine_models/turbine_tank/turbine_tank.py b/models/turbine_models/turbine_tank/turbine_tank.py index e5947cd58..8150cc157 100644 --- a/models/turbine_models/turbine_tank/turbine_tank.py +++ b/models/turbine_models/turbine_tank/turbine_tank.py @@ -19,6 +19,7 @@ print( f"turbine_tank local cache is located at {WORKDIR} . You may change this by assigning the TURBINE_TANK_CACHE_DIR environment variable." 
) +os.makedirs(WORKDIR, exist_ok=True) storage_account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY") storage_account_name = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME") From 385d910bd98e9242070a6bb26735ec538009cdb0 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 21 Feb 2024 13:37:31 -0800 Subject: [PATCH 09/20] back to hardcoded credentials --- models/turbine_models/turbine_tank/turbine_tank.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/models/turbine_models/turbine_tank/turbine_tank.py b/models/turbine_models/turbine_tank/turbine_tank.py index 8150cc157..708218c75 100644 --- a/models/turbine_models/turbine_tank/turbine_tank.py +++ b/models/turbine_models/turbine_tank/turbine_tank.py @@ -21,10 +21,10 @@ ) os.makedirs(WORKDIR, exist_ok=True) -storage_account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY") -storage_account_name = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME") -connection_string = os.environ.get("AZURE_CONNECTION_STRING") -container_name = os.environ.get("AZURE_CONTAINER_NAME") +storage_account_key = "XSsr+KqxBLxXzRtFv3QbbdsAxdwDGe661Q1xY4ziMRtpCazN8W6HZePi6nwud5RNLC5Y7e410abg+AStyzmX1A==" +storage_account_name = "tankturbine" +connection_string = "DefaultEndpointsProtocol=https;AccountName=tankturbine;AccountKey=XSsr+KqxBLxXzRtFv3QbbdsAxdwDGe661Q1xY4ziMRtpCazN8W6HZePi6nwud5RNLC5Y7e410abg+AStyzmX1A==;EndpointSuffix=core.windows.net" +container_name = "tankturbine" def get_short_git_sha() -> str: From 07bcc6ff4272c18e216b75f082a848c6246b57f7 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 21 Feb 2024 15:48:39 -0800 Subject: [PATCH 10/20] update to env vars --- models/turbine_models/turbine_tank/turbine_tank.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/models/turbine_models/turbine_tank/turbine_tank.py b/models/turbine_models/turbine_tank/turbine_tank.py index 708218c75..8150cc157 100644 --- a/models/turbine_models/turbine_tank/turbine_tank.py +++ b/models/turbine_models/turbine_tank/turbine_tank.py @@ -21,10 +21,10 @@ ) os.makedirs(WORKDIR, exist_ok=True) -storage_account_key = "XSsr+KqxBLxXzRtFv3QbbdsAxdwDGe661Q1xY4ziMRtpCazN8W6HZePi6nwud5RNLC5Y7e410abg+AStyzmX1A==" -storage_account_name = "tankturbine" -connection_string = "DefaultEndpointsProtocol=https;AccountName=tankturbine;AccountKey=XSsr+KqxBLxXzRtFv3QbbdsAxdwDGe661Q1xY4ziMRtpCazN8W6HZePi6nwud5RNLC5Y7e410abg+AStyzmX1A==;EndpointSuffix=core.windows.net" -container_name = "tankturbine" +storage_account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY") +storage_account_name = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME") +connection_string = os.environ.get("AZURE_CONNECTION_STRING") +container_name = os.environ.get("AZURE_CONTAINER_NAME") def get_short_git_sha() -> str: From d14990169df5b0e15283e609e4a7daecca0630b6 Mon Sep 17 00:00:00 2001 From: saienduri Date: Fri, 23 Feb 2024 17:43:58 -0800 Subject: [PATCH 11/20] add support for external param flow --- models/turbine_models/model_builder.py | 52 +++--- models/turbine_models/tests/sd_test.py | 6 +- .../turbine_models/turbine_tank/run_tank.py | 2 +- .../turbine_models/turbine_tank/tank_test.py | 39 ++-- .../turbine_models/turbine_tank/tank_util.py | 174 +++++++++++++++++- .../turbine_tank/turbine_tank.py | 22 ++- 6 files changed, 233 insertions(+), 62 deletions(-) diff --git a/models/turbine_models/model_builder.py b/models/turbine_models/model_builder.py index 6f5c8b578..2577a39b5 100644 --- a/models/turbine_models/model_builder.py +++ b/models/turbine_models/model_builder.py @@ -21,7 +21,7 @@ 
class HFTransformerBuilder: def __init__( self, example_input: torch.Tensor, - hf_id: str, + hf_id: str = None, auto_model: AutoModel = AutoModel, auto_tokenizer: AutoTokenizer = None, auto_config: AutoConfig = None, @@ -50,15 +50,16 @@ def build_model(self) -> None: Builds a PyTorch model using Hugging Face's transformers library. """ # TODO: check cloud storage for existing ir - self.model = self.auto_model.from_pretrained( - self.hf_id, token=self.hf_auth_token, config=self.auto_config - ) - if self.auto_tokenizer is not None: - self.tokenizer = self.auto_tokenizer.from_pretrained( - self.hf_id, token=self.hf_auth_token + if self.hf_id: + self.model = self.auto_model.from_pretrained( + self.hf_id, token=self.hf_auth_token, config=self.auto_config ) - else: - self.tokenizer = None + if self.auto_tokenizer is not None: + self.tokenizer = self.auto_tokenizer.from_pretrained( + self.hf_id, token=self.hf_auth_token + ) + else: + self.tokenizer = None def get_compiled_module(self, save_to: str = None) -> aot.CompiledModule: """ @@ -74,19 +75,20 @@ def get_compiled_module(self, save_to: str = None) -> aot.CompiledModule: module = aot.export(self.model, *self.example_input) else: module = aot.export(self.model, self.example_input) - module_str = str(module.mlir_module) - safe_name = self.hf_id.split("/")[-1].strip() - safe_name = re.sub("-", "_", safe_name) - if self.upload_ir: - with open(f"{safe_name}.mlir", "w+") as f: - f.write(module_str) - model_name_upload = self.hf_id.replace("/", "_") - turbine_tank.uploadToBlobStorage( - str(os.path.abspath(f"{safe_name}.mlir")), - f"{model_name_upload}/{model_name_upload}.mlir", - ) - os.remove(f"{safe_name}.mlir") - if self.run_e2e is not None and self.run_e2e is False: - return - compiled_binary = module.compile(save_to=save_to) - return compiled_binary + if self.hf_id: + module_str = str(module.mlir_module) + safe_name = self.hf_id.split("/")[-1].strip() + safe_name = re.sub("-", "_", safe_name) + if self.upload_ir: + with open(f"{safe_name}.mlir", "w+") as f: + f.write(module_str) + model_name_upload = self.hf_id.replace("/", "_") + turbine_tank.uploadToBlobStorage( + str(os.path.abspath(f"{safe_name}.mlir")), + f"{model_name_upload}/{model_name_upload}.mlir", + ) + os.remove(f"{safe_name}.mlir") + if self.run_e2e is not None and self.run_e2e is False: + return + compiled_binary = module.compile(save_to=save_to) + return compiled_binary diff --git a/models/turbine_models/tests/sd_test.py b/models/turbine_models/tests/sd_test.py index 01887db72..f555f5fdd 100644 --- a/models/turbine_models/tests/sd_test.py +++ b/models/turbine_models/tests/sd_test.py @@ -262,9 +262,9 @@ def testExportPNDMScheduler(self): "cpu", ) self.assertEqual(cm.exception.code, None) - arguments[ - "external_weight_path" - ] = "stable_diffusion_v1_4_scheduler.safetensors" + arguments["external_weight_path"] = ( + "stable_diffusion_v1_4_scheduler.safetensors" + ) arguments["vmfb_path"] = "stable_diffusion_v1_4_scheduler.vmfb" sample = torch.rand( arguments["batch_size"], diff --git a/models/turbine_models/turbine_tank/run_tank.py b/models/turbine_models/turbine_tank/run_tank.py index 816d92322..a4e77db07 100644 --- a/models/turbine_models/turbine_tank/run_tank.py +++ b/models/turbine_models/turbine_tank/run_tank.py @@ -1,4 +1,4 @@ -# Copyright 2023 Nod Labs, Inc +# Copyright 2024 Advanced Micro Devices, Inc # # Licensed under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
diff --git a/models/turbine_models/turbine_tank/tank_test.py b/models/turbine_models/turbine_tank/tank_test.py index 17c8ec8ba..d1f825e46 100644 --- a/models/turbine_models/turbine_tank/tank_test.py +++ b/models/turbine_models/turbine_tank/tank_test.py @@ -1,10 +1,12 @@ +# Copyright 2024 Advanced Micro Devices, Inc +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + import pytest from turbine_models.turbine_tank import tank_util from turbine_models.model_builder import HFTransformerBuilder -from turbine_models.model_runner import vmfbRunner -from turbine_models.custom_models.sd_inference import utils -from iree import runtime as ireert -import os @pytest.mark.parametrize( @@ -95,6 +97,7 @@ def test_all_models(model_name, model_type, expected_err, run_e2e): import_args = { "batch_size": 1, } + # Based on the model type, get the appropriate hugging face model, inputs, and output if model_type == "vision": torch_model, input, out = tank_util.get_vision_model(model_name, import_args) @@ -112,32 +115,20 @@ def test_all_models(model_name, model_type, expected_err, run_e2e): torch_model, input, out = tank_util.get_hf_img_cls_model( model_name, import_args ) - # compile model and get vmfb + + # create hugging face transformer model model = HFTransformerBuilder( example_input=input, hf_id=model_name, - hf_auth_token="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", upload_ir=True, model=torch_model, model_type=model_type, run_e2e=run_e2e, ) - vmfb_name = model_name.replace("/", "_") + ".vmfb" - model.get_compiled_module(save_to=vmfb_name) - # if model is not supposed to run e2e, exit at this point (mlir has been uploaded) - if run_e2e is False: - assert expected_err > 0 - return - - # run inference using iree runtime - runner = vmfbRunner("local-task", vmfb_name) - inputs = [ireert.asdevicearray(runner.config.device, input)] - keys = list(runner.ctx.modules) - key = keys[len(keys) - 1] - results = runner.ctx.modules.__getattr__(key)["main"](*inputs) - err = utils.largest_error(out.cpu().detach().numpy(), results) - # cleanup - os.remove(vmfb_name) - # accuracy - assert err < expected_err + # runs using external params + tank_util.param_flow( + model, model_name, model_type, input, out, run_e2e, expected_err + ) + # inline weights + tank_util.classic_flow(model, model_name, input, out, run_e2e, expected_err) diff --git a/models/turbine_models/turbine_tank/tank_util.py b/models/turbine_models/turbine_tank/tank_util.py index 4832e9e24..718caf9d7 100644 --- a/models/turbine_models/turbine_tank/tank_util.py +++ b/models/turbine_models/turbine_tank/tank_util.py @@ -1,5 +1,20 @@ +# Copyright 2024 Advanced Micro Devices, Inc +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import torch +import iree.compiler as ireec import torch -import numpy as np +from turbine_models.turbine_tank import tank_util +from turbine_models.model_runner import vmfbRunner +from turbine_models.custom_models.sd_inference import utils +from iree import runtime as ireert +import os +from shark_turbine.aot import * +from iree.compiler.ir import Context +from turbine_models.turbine_tank import turbine_tank torch.manual_seed(0) @@ -88,6 +103,7 @@ def get_hf_img_cls_model(name, import_args): # print("test_input.shape: ", test_input.shape) # test_input.shape: torch.Size([1, 3, 224, 224]) test_input = test_input.repeat(int(import_args["batch_size"]), 1, 1, 1) + print(f"YOOO TEST INPUT: {test_input.shape}") actual_out = model(test_input) # actual_out.shape: torch.Size([1, 1000]) return model, test_input, actual_out @@ -118,9 +134,6 @@ def forward(self, tokens): def get_hf_model(name, import_args): - from transformers import ( - BertTokenizer, - ) model = HuggingFaceLanguage(name) test_input = torch.randint(2, (int(import_args["batch_size"]), 128)) @@ -172,7 +185,9 @@ def get_hf_seq2seq_model(name, import_args): def prepare_sentence_tokens(hf_model: str, sentence: str): - tokenizer = AutoTokenizer.from_pretrained(hf_model) + tokenizer = AutoTokenizer.from_pretrained( + hf_model, token="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" + ) return torch.tensor([tokenizer.encode(sentence)]) @@ -189,6 +204,7 @@ def __init__(self, model_name: str): output_hidden_states=False, torchscript=True, trust_remote_code=True, + token="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", ) self.model.eval() @@ -258,3 +274,151 @@ def get_vision_model(torch_model, import_args): test_input = torch.randn(int(import_args["batch_size"]), 3, *input_image_size) actual_out = model(test_input) return model, test_input, actual_out + + +def compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name): + flags = [ + "--iree-input-type=torch", + "--mlir-print-debuginfo", + "--mlir-print-op-on-diagnostic=false", + "--iree-llvmcpu-target-cpu-features=host", + "--iree-llvmcpu-target-triple=x86_64-linux-gnu", + "--iree-stream-resource-index-bits=64", + "--iree-vm-target-index-bits=64", + "--iree-flow-inline-constants-max-byte-length=1", + ] + if device == "cpu": + flags.append("--iree-llvmcpu-enable-ukernels=all") + device = "llvm-cpu" + elif device == "vulkan": + flags.extend( + [ + "--iree-hal-target-backends=vulkan-spirv", + "--iree-vulkan-target-triple=" + target_triple, + "--iree-stream-resource-max-allocation-size=" + max_alloc, + ] + ) + elif device == "rocm": + flags.extend( + [ + "--iree-hal-target-backends=rocm", + "--iree-rocm-target-chip=" + target_triple, + "--iree-rocm-link-bc=true", + "--iree-rocm-bc-dir=/opt/rocm/amdgcn/bitcode", + "--iree-vm-bytecode-module-strip-source-map=true", + "--iree-opt-strip-assertions=true", + "--iree-vm-target-truncate-unsupported-floats", + ] + ) + elif device == "cuda": + flags.extend( + [ + "--iree-hal-target-backends=cuda", + "--iree-hal-cuda-llvm-target-arch=" + target_triple, + "--iree-vm-bytecode-module-strip-source-map=true", + "--iree-vm-target-truncate-unsupported-floats", + ] + ) + else: + print("incorrect device: ", device) + + flatbuffer_blob = ireec.compile_str( + module_str, + target_backends=[device], + extra_args=flags, + ) + with open(f"{safe_name}.vmfb", "wb+") as f: + f.write(flatbuffer_blob) + print("Saved to", safe_name + ".vmfb") + + +def classic_flow(model, model_name, input, out, run_e2e, expected_err): + 
vmfb_name = model_name.replace("/", "_") + ".vmfb" + model.get_compiled_module(save_to=vmfb_name) + + # if model is not supposed to run e2e, exit at this point (mlir has been uploaded) + if run_e2e is False: + assert expected_err > 0 + return + + # run inference using iree runtime + runner = vmfbRunner("local-task", vmfb_name) + inputs = [ireert.asdevicearray(runner.config.device, input)] + keys = list(runner.ctx.modules) + key = keys[len(keys) - 1] + results = runner.ctx.modules.__getattr__(key)["main"](*inputs) + err = utils.largest_error(out.cpu().detach().numpy(), results) + # cleanup + os.remove(vmfb_name) + # accuracy + assert err < expected_err + + +def param_flow(model, model_name, model_type, input, out, run_e2e, expected_err): + weight_name = model_name.replace("/", "_") + ".safetensors" + mapper = {} + utils.save_external_weights(mapper, model.model, "safetensors", weight_name) + + # seq2seq models differs from rest as it take two inputs (input_ids, decoder_input_ids) + if model_type == "hf_seq2seq": + + class Seq2SeqModule(CompiledModule): + params = export_parameters( + model.model, external=True, external_scope="", name_mapper=mapper.get + ) + + def main( + self, + inp1=AbstractTensor(*(input[0].shape), dtype=input[0].dtype), + inp2=AbstractTensor(*(input[1].shape), dtype=input[1].dtype), + ): + return jittable(model.model.forward)(inp1, inp2) + + inst = Seq2SeqModule(context=Context(), import_to="IMPORT") + module_str = str(CompiledModule.get_mlir_module(inst)) + else: + + class GlobalModule(CompiledModule): + params = export_parameters( + model.model, external=True, external_scope="", name_mapper=mapper.get + ) + + def main(self, inp=AbstractTensor(*input.shape, dtype=input.dtype)): + return jittable(model.model.forward)(inp) + + inst = GlobalModule(context=Context(), import_to="IMPORT") + module_str = str(CompiledModule.get_mlir_module(inst)) + + mlir_name = model_name.replace("/", "_") + ".mlir" + with open(mlir_name, "w+") as f: + f.write(module_str) + + model_name_upload = model_name.replace("/", "_") + turbine_tank.uploadToBlobStorage( + str(os.path.abspath(mlir_name)), + f"{model_name_upload}/{model_name_upload}-params.mlir", + ) + + os.remove(mlir_name) + + if run_e2e is False: + assert expected_err > 0 + return + + vmfb_name = model_name.replace("/", "_") + tank_util.compile_to_vmfb(module_str, "cpu", "", "", vmfb_name) + + # run inference using iree runtime + runner = vmfbRunner("local-task", vmfb_name + ".vmfb", weight_name) + inputs = [ireert.asdevicearray(runner.config.device, input)] + keys = list(runner.ctx.modules) + key = keys[len(keys) - 1] + results = runner.ctx.modules.__getattr__(key)["main"](*inputs) + err = utils.largest_error(out.cpu().detach().numpy(), results) + + # clean up + os.remove(vmfb_name + ".vmfb") + os.remove(weight_name) + + # accuracy + assert err < expected_err diff --git a/models/turbine_models/turbine_tank/turbine_tank.py b/models/turbine_models/turbine_tank/turbine_tank.py index 8150cc157..36dc07a4a 100644 --- a/models/turbine_models/turbine_tank/turbine_tank.py +++ b/models/turbine_models/turbine_tank/turbine_tank.py @@ -1,3 +1,9 @@ +# Copyright 2024 Advanced Micro Devices, Inc +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + from azure.storage.blob import BlobServiceClient import subprocess @@ -65,7 +71,7 @@ def uploadToBlobStorage(file_path, file_name): def checkAndRemoveIfDownloadedOld(model_name: str, model_dir: str, prefix: str): - if os.path.isdir(model_dir) and len(os.listdir(model_dir)) == 1: + if os.path.isdir(model_dir) and len(os.listdir(model_dir)) > 0: for item in os.listdir(model_dir): item_path = os.path.join(model_dir, item) # model artifacts already downloaded and up to date @@ -79,6 +85,12 @@ def checkAndRemoveIfDownloadedOld(model_name: str, model_dir: str, prefix: str): os.remove(os.path.join(item_path, model_name + ".mlir")) os.rmdir(item_path) return False + if os.path.isdir(item_path) and os.path.isfile( + os.path.join(item_path, model_name + "-param.mlir") + ): + os.remove(os.path.join(item_path, model_name + "-param.mlir")) + os.rmdir(item_path) + return False # did not downloaded this model artifacts yet return False @@ -104,9 +116,11 @@ def download_public_folder(model_name: str, prefix: str, model_dir: str): if not os.path.isdir(dest_path): os.makedirs(dest_path) # download blob into local turbine tank cache - with open( - file=os.path.join(model_dir, model_name + ".mlir"), mode="wb" - ) as sample_blob: + if "param" in blob.name: + file_path = os.path.join(model_dir, model_name + "-param.mlir") + else: + file_path = os.path.join(model_dir, model_name + ".mlir") + with open(file=file_path, mode="wb") as sample_blob: download_stream = blob_client.download_blob() sample_blob.write(download_stream.readall()) From f97e3ce09c6a7e099281033c09072eadfad875d5 Mon Sep 17 00:00:00 2001 From: saienduri Date: Sun, 25 Feb 2024 22:54:57 -0800 Subject: [PATCH 12/20] formatting --- models/turbine_models/tests/sd_test.py | 6 +++--- models/turbine_models/turbine_tank/tank_util.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/models/turbine_models/tests/sd_test.py b/models/turbine_models/tests/sd_test.py index f555f5fdd..01887db72 100644 --- a/models/turbine_models/tests/sd_test.py +++ b/models/turbine_models/tests/sd_test.py @@ -262,9 +262,9 @@ def testExportPNDMScheduler(self): "cpu", ) self.assertEqual(cm.exception.code, None) - arguments["external_weight_path"] = ( - "stable_diffusion_v1_4_scheduler.safetensors" - ) + arguments[ + "external_weight_path" + ] = "stable_diffusion_v1_4_scheduler.safetensors" arguments["vmfb_path"] = "stable_diffusion_v1_4_scheduler.vmfb" sample = torch.rand( arguments["batch_size"], diff --git a/models/turbine_models/turbine_tank/tank_util.py b/models/turbine_models/turbine_tank/tank_util.py index 718caf9d7..90b80829c 100644 --- a/models/turbine_models/turbine_tank/tank_util.py +++ b/models/turbine_models/turbine_tank/tank_util.py @@ -134,7 +134,6 @@ def forward(self, tokens): def get_hf_model(name, import_args): - model = HuggingFaceLanguage(name) test_input = torch.randint(2, (int(import_args["batch_size"]), 128)) actual_out = model(test_input) From 8f48a800b6f7af97477e343468de5a49a98db614 Mon Sep 17 00:00:00 2001 From: saienduri Date: Mon, 26 Feb 2024 22:27:10 -0800 Subject: [PATCH 13/20] remove debug --- models/turbine_models/turbine_tank/tank_util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/models/turbine_models/turbine_tank/tank_util.py b/models/turbine_models/turbine_tank/tank_util.py index 90b80829c..154d3dc34 100644 --- a/models/turbine_models/turbine_tank/tank_util.py +++ b/models/turbine_models/turbine_tank/tank_util.py @@ -103,7 +103,6 @@ def get_hf_img_cls_model(name, 
import_args): # print("test_input.shape: ", test_input.shape) # test_input.shape: torch.Size([1, 3, 224, 224]) test_input = test_input.repeat(int(import_args["batch_size"]), 1, 1, 1) - print(f"YOOO TEST INPUT: {test_input.shape}") actual_out = model(test_input) # actual_out.shape: torch.Size([1, 1000]) return model, test_input, actual_out From 826e251eb93beb67fda0741e7db8fd96d90c294b Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 28 Feb 2024 14:00:49 -0800 Subject: [PATCH 14/20] moving turbine tank out to test suite --- .../tests/stateless_llama_test.py | 8 +- .../turbine_models/turbine_tank/run_tank.py | 61 --- .../turbine_models/turbine_tank/tank_test.py | 134 ------ .../turbine_models/turbine_tank/tank_util.py | 422 ------------------ 4 files changed, 6 insertions(+), 619 deletions(-) delete mode 100644 models/turbine_models/turbine_tank/run_tank.py delete mode 100644 models/turbine_models/turbine_tank/tank_test.py delete mode 100644 models/turbine_models/turbine_tank/tank_util.py diff --git a/models/turbine_models/tests/stateless_llama_test.py b/models/turbine_models/tests/stateless_llama_test.py index c72c55e55..1e87120fa 100644 --- a/models/turbine_models/tests/stateless_llama_test.py +++ b/models/turbine_models/tests/stateless_llama_test.py @@ -68,7 +68,9 @@ def test_vmfb_comparison(self): upload_ir=upload_ir_var == "upload", ) - torch_str_cache_path = f"models/turbine_models/tests/vmfb_comparison_cached_torch_output_{precision}_{quantization}.txt" + torch_str_cache_path = ( + f"vmfb_comparison_cached_torch_output_{precision}_{quantization}.txt" + ) # if cached, just read if os.path.exists(torch_str_cache_path): with open(torch_str_cache_path, "r") as f: @@ -109,7 +111,9 @@ def test_streaming_vmfb_comparison(self): vmfb_path="streaming_llama.vmfb", ) - torch_str_cache_path = f"models/turbine_models/tests/vmfb_comparison_cached_torch_output_{precision}_{quantization}.txt" + torch_str_cache_path = ( + f"vmfb_comparison_cached_torch_output_{precision}_{quantization}.txt" + ) # if cached, just read if os.path.exists(torch_str_cache_path): with open(torch_str_cache_path, "r") as f: diff --git a/models/turbine_models/turbine_tank/run_tank.py b/models/turbine_models/turbine_tank/run_tank.py deleted file mode 100644 index a4e77db07..000000000 --- a/models/turbine_models/turbine_tank/run_tank.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 2024 Advanced Micro Devices, Inc -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -import argparse -import unittest -from turbine_models.turbine_tank import tank_util - -import turbine_models.tests.sd_test as sd_test -import os -from turbine_models.turbine_tank import turbine_tank - -import pytest - -parser = argparse.ArgumentParser() -parser.add_argument( - "--download_ir", - action=argparse.BooleanOptionalAction, - default=False, - help="download IR from turbine tank", -) - -if __name__ == "__main__": - args = parser.parse_args() - - if args.download_ir: - turbine_tank.downloadModelArtifacts("CompVis/stable-diffusion-v1-4-clip") - turbine_tank.downloadModelArtifacts("CompVis/stable-diffusion-v1-4-vae-decode") - turbine_tank.downloadModelArtifacts("CompVis/stable-diffusion-v1-4-vae-encode") - turbine_tank.downloadModelArtifacts("CompVis/stable-diffusion-v1-4-unet") - turbine_tank.downloadModelArtifacts( - "Trelis/Llama-2-7b-chat-hf-function-calling-v2" - ) - for model_name, _ in tank_util.model_list: - turbine_tank.downloadModelArtifacts(model_name) - else: - import turbine_models.tests.stateless_llama_test as stateless_llama_test - - # environment variable used to let the llama/sd tests know we are running from tank and want to upload - os.environ["TURBINE_TANK_ACTION"] = "upload" - - # run existing turbine llama and sd tests integrated with turbine tank - llama_suite = unittest.TestLoader().loadTestsFromModule(stateless_llama_test) - unittest.TextTestRunner(verbosity=2).run(llama_suite) - - sd_suite = unittest.TestLoader().loadTestsFromModule(sd_test) - unittest.TextTestRunner(verbosity=2).run(sd_suite) - - # cleanup - os.remove("Llama_2_7b_chat_hf_function_calling_v2_f32_unquantized.safetensors") - os.remove("Llama_2_7b_chat_hf_function_calling_v2.mlir") - os.remove("Llama_2_7b_chat_hf_function_calling_v2.vmfb") - os.remove("streaming_llama.vmfb") - os.remove("stable_diffusion_v1_4_clip.mlir") - os.remove("stable_diffusion_v1_4_unet.mlir") - os.remove("stable_diffusion_v1_4_vae.mlir") - - # runs tank_test.py (only pytest file in this directory, runs 30 models e2e) - pytest.main(["-v", os.path.dirname(os.path.abspath(__file__))]) diff --git a/models/turbine_models/turbine_tank/tank_test.py b/models/turbine_models/turbine_tank/tank_test.py deleted file mode 100644 index d1f825e46..000000000 --- a/models/turbine_models/turbine_tank/tank_test.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2024 Advanced Micro Devices, Inc -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -import pytest -from turbine_models.turbine_tank import tank_util -from turbine_models.model_builder import HFTransformerBuilder - - -@pytest.mark.parametrize( - "model_name,model_type,expected_err,run_e2e", - [ - ("microsoft/resnet-50", "hf_img_cls", 8e-05, True), - ("bert-large-uncased", "hf", 8e-06, True), - ("facebook/deit-small-distilled-patch16-224", "hf_img_cls", 8e-05, True), - ("google/vit-base-patch16-224", "hf_img_cls", 8e-05, True), - ("microsoft/beit-base-patch16-224-pt22k-ft22k", "hf_img_cls", 8e-05, True), - ("microsoft/MiniLM-L12-H384-uncased", "hf", 5e-07, True), - ("google/mobilebert-uncased", "hf", 4.3, True), - ("mobilenet_v3_small", "vision", 6e-05, True), - ("nvidia/mit-b0", "hf_img_cls", 7.3, True), - ("resnet101", "vision", 8e-06, True), - ("resnet18", "vision", 8e-06, True), - ("resnet50", "vision", 8e-06, True), - ("squeezenet1_0", "vision", 9e-06, True), - ("wide_resnet50_2", "vision", 9e-06, True), - ("mnasnet1_0", "vision", 2e-05, True), - pytest.param( - "t5-base", - "hf_seq2seq", - -1, - False, - marks=pytest.mark.xfail(reason="iree-compile fails"), - ), - pytest.param( - "t5-large", - "hf_seq2seq", - -1, - False, - marks=pytest.mark.xfail(reason="iree-compile fails"), - ), - ("openai/whisper-base", "hf_causallm", 9e-05, True), - ("openai/whisper-small", "hf_causallm", 0.0003, True), - ("openai/whisper-medium", "hf_causallm", 0.0003, True), - ("facebook/opt-350m", "hf", 9e-07, True), - ("facebook/opt-1.3b", "hf", 9e-06, True), - ("BAAI/bge-base-en-v1.5", "hf", 9e-07, True), - pytest.param( - "facebook/bart-large", - "hf_seq2seq", - -1, - False, - marks=pytest.mark.xfail(reason="iree-compile fails"), - ), - pytest.param( - "gpt2", - "hf", - -1, - False, - marks=pytest.mark.xfail(reason="iree-compile fails"), - ), - pytest.param( - "gpt2-xl", - "hf", - -1, - False, - marks=pytest.mark.xfail(reason="iree-compile fails"), - ), - ("lmsys/vicuna-13b-v1.3", "hf", 5e-05, True), - pytest.param( - "microsoft/phi-1_5", - "hf_causallm", - -1, - True, - marks=pytest.mark.xfail(reason="correctness issue"), - ), # nan error reported (correctness issue) - pytest.param( - "microsoft/phi-2", - "hf_causallm", - -1, - True, - marks=pytest.mark.xfail(reason="correctness issue"), - ), # nan error reported (correctness issue) - pytest.param( - "mosaicml/mpt-30b", - "hf_causallm", - -1, - False, - marks=pytest.mark.xfail(reason="iree-compile fails"), - ), - ("stabilityai/stablelm-3b-4e1t", "hf_causallm", 0.0004, True), - ], -) -def test_all_models(model_name, model_type, expected_err, run_e2e): - import_args = { - "batch_size": 1, - } - - # Based on the model type, get the appropriate hugging face model, inputs, and output - if model_type == "vision": - torch_model, input, out = tank_util.get_vision_model(model_name, import_args) - elif model_type == "hf": - torch_model, input, out = tank_util.get_hf_model(model_name, import_args) - elif model_type == "hf_seq2seq": - torch_model, input, out = tank_util.get_hf_seq2seq_model( - model_name, import_args - ) - elif model_type == "hf_causallm": - torch_model, input, out = tank_util.get_hf_causallm_model( - model_name, import_args - ) - elif model_type == "hf_img_cls": - torch_model, input, out = tank_util.get_hf_img_cls_model( - model_name, import_args - ) - - # create hugging face transformer model - model = HFTransformerBuilder( - example_input=input, - hf_id=model_name, - upload_ir=True, - model=torch_model, - model_type=model_type, - run_e2e=run_e2e, - ) - - # runs using 
external params - tank_util.param_flow( - model, model_name, model_type, input, out, run_e2e, expected_err - ) - # inline weights - tank_util.classic_flow(model, model_name, input, out, run_e2e, expected_err) diff --git a/models/turbine_models/turbine_tank/tank_util.py b/models/turbine_models/turbine_tank/tank_util.py deleted file mode 100644 index 154d3dc34..000000000 --- a/models/turbine_models/turbine_tank/tank_util.py +++ /dev/null @@ -1,422 +0,0 @@ -# Copyright 2024 Advanced Micro Devices, Inc -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -import torch -import iree.compiler as ireec -import torch -from turbine_models.turbine_tank import tank_util -from turbine_models.model_runner import vmfbRunner -from turbine_models.custom_models.sd_inference import utils -from iree import runtime as ireert -import os -from shark_turbine.aot import * -from iree.compiler.ir import Context -from turbine_models.turbine_tank import turbine_tank - -torch.manual_seed(0) - -BATCH_SIZE = 1 - -model_list = [ - ("microsoft/resnet-50", "hf_img_cls"), - ("bert-large-uncased", "hf"), - ("facebook/deit-small-distilled-patch16-224", "hf_img_cls"), - ("google/vit-base-patch16-224", "hf_img_cls"), - ("microsoft/beit-base-patch16-224-pt22k-ft22k", "hf_img_cls"), - ("microsoft/MiniLM-L12-H384-uncased", "hf"), - ("google/mobilebert-uncased", "hf"), - ("mobilenet_v3_small", "vision"), - ("nvidia/mit-b0", "hf_img_cls"), - ("resnet101", "vision"), - ("resnet18", "vision"), - ("resnet50", "vision"), - ("squeezenet1_0", "vision"), - ("wide_resnet50_2", "vision"), - ("mnasnet1_0", "vision"), - ("t5-base", "hf_seq2seq"), # iree-compile failure - ("t5-large", "hf_seq2seq"), # iree-compile failure - ("openai/whisper-base", "hf_causallm"), - ("openai/whisper-small", "hf_causallm"), - ("openai/whisper-medium", "hf_causallm"), - ("facebook/opt-350m", "hf"), - ("facebook/opt-1.3b", "hf"), - ("BAAI/bge-base-en-v1.5", "hf"), - ("facebook/bart-large", "hf_seq2seq"), # iree-compile fails - ("gpt2", "hf"), # iree-compile fails - ("gpt2-xl", "hf"), # iree-compile fails - ("lmsys/vicuna-13b-v1.3", "hf"), - ("microsoft/phi-1_5", "hf_causallm"), # nan error reported (correctness issue) - ("microsoft/phi-2", "hf_causallm"), # nan error reported (correctness issue) - ("mosaicml/mpt-30b", "hf_causallm"), # iree-compile fails - ("stabilityai/stablelm-3b-4e1t", "hf_causallm"), -] - - -##################### Hugging Face Image Classification Models ################################### -from transformers import AutoModelForImageClassification -from transformers import AutoFeatureExtractor -from PIL import Image -import requests - - -def preprocess_input_image(model_name): - # from datasets import load_dataset - # dataset = load_dataset("huggingface/cats-image") - # image1 = dataset["test"]["image"][0] - # # print("image1: ", image1) # - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - # - image = Image.open(requests.get(url, stream=True).raw) - # feature_extractor = img_models_fe_dict[model_name].from_pretrained( - # model_name - # ) - feature_extractor = AutoFeatureExtractor.from_pretrained(model_name) - inputs = feature_extractor(images=image, return_tensors="pt") - # inputs = {'pixel_values': tensor([[[[ 0.1137..., -0.2000, -0.4275, -0.5294]]]])} - # torch.Size([1, 3, 224, 224]), torch.FloatTensor - - return inputs[str(*inputs)] - - -class HuggingFaceImageClassification(torch.nn.Module): - 
def __init__(self, hf_model_name): - super().__init__() - self.model = AutoModelForImageClassification.from_pretrained( - hf_model_name, # The pretrained model. - output_attentions=False, # Whether the model returns attentions weights. - return_dict=False, # https://github.com/huggingface/transformers/issues/9095 - torchscript=True, - ) - - def forward(self, inputs): - return self.model.forward(inputs)[0] - - -def get_hf_img_cls_model(name, import_args): - model = HuggingFaceImageClassification(name) - # you can use preprocess_input_image to get the test_input or just random value. - test_input = preprocess_input_image(name) - # test_input = torch.FloatTensor(1, 3, 224, 224).uniform_(-1, 1) - # print("test_input.shape: ", test_input.shape) - # test_input.shape: torch.Size([1, 3, 224, 224]) - test_input = test_input.repeat(int(import_args["batch_size"]), 1, 1, 1) - actual_out = model(test_input) - # actual_out.shape: torch.Size([1, 1000]) - return model, test_input, actual_out - - -##################### Hugging Face LM Models ################################### - - -class HuggingFaceLanguage(torch.nn.Module): - def __init__(self, hf_model_name): - super().__init__() - from transformers import AutoModelForSequenceClassification, AutoTokenizer - import transformers as trf - - transformers_path = trf.__path__[0] - hf_model_path = f"{transformers_path}/models/{hf_model_name}" - self.model = AutoModelForSequenceClassification.from_pretrained( - hf_model_name, # The pretrained model. - num_labels=2, # The number of output labels--2 for binary classification. - output_attentions=False, # Whether the model returns attentions weights. - output_hidden_states=False, # Whether the model returns all hidden-states. - torchscript=True, - ) - self.model.config.pad_token_id = None - - def forward(self, tokens): - return self.model.forward(tokens)[0] - - -def get_hf_model(name, import_args): - model = HuggingFaceLanguage(name) - test_input = torch.randint(2, (int(import_args["batch_size"]), 128)) - actual_out = model(test_input) - return model, test_input, actual_out - - -##################### Hugging Face Seq2SeqLM Models ################################### - -# We use a maximum sequence length of 512 since this is the default used in the T5 config. 
-T5_MAX_SEQUENCE_LENGTH = 512 - - -class HFSeq2SeqLanguageModel(torch.nn.Module): - def __init__(self, model_name): - super().__init__() - from transformers import AutoTokenizer, T5Model - - self.tokenizer = AutoTokenizer.from_pretrained(model_name) - self.tokenization_kwargs = { - "pad_to_multiple_of": T5_MAX_SEQUENCE_LENGTH, - "padding": True, - "return_tensors": "pt", - } - self.model = T5Model.from_pretrained(model_name, return_dict=True) - - def preprocess_input(self, text): - return self.tokenizer(text, **self.tokenization_kwargs) - - def forward(self, input_ids, decoder_input_ids): - return self.model.forward(input_ids, decoder_input_ids=decoder_input_ids)[0] - - -def get_hf_seq2seq_model(name, import_args): - m = HFSeq2SeqLanguageModel(name) - encoded_input_ids = m.preprocess_input( - "Studies have been shown that owning a dog is good for you" - ).input_ids - decoder_input_ids = m.preprocess_input("Studies show that").input_ids - decoder_input_ids = m.model._shift_right(decoder_input_ids) - - test_input = (encoded_input_ids, decoder_input_ids) - actual_out = m.forward(*test_input) - return m, test_input, actual_out - - -##################### Hugging Face CausalLM Models ################################### -from transformers import AutoTokenizer, AutoModelForCausalLM - - -def prepare_sentence_tokens(hf_model: str, sentence: str): - tokenizer = AutoTokenizer.from_pretrained( - hf_model, token="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" - ) - return torch.tensor([tokenizer.encode(sentence)]) - - -class HFCausalLM(torch.nn.Module): - def __init__(self, model_name: str): - super().__init__() - self.model = AutoModelForCausalLM.from_pretrained( - model_name, # The pretrained model name. - # The number of output labels--2 for binary classification. - num_labels=2, - # Whether the model returns attentions weights. - output_attentions=False, - # Whether the model returns all hidden-states. 
- output_hidden_states=False, - torchscript=True, - trust_remote_code=True, - token="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", - ) - self.model.eval() - - def forward(self, tokens): - return self.model.forward(tokens)[0] - - -def get_hf_causallm_model(name, import_args): - m = HFCausalLM(name) - test_input = prepare_sentence_tokens(name, "this project is very interesting") - actual_out = m.forward(test_input) - return m, test_input, actual_out - - -################################################################################ - -##################### Torch Vision Models ################################### - - -class VisionModule(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - self.train(False) - - def forward(self, input): - return self.model.forward(input) - - -def get_vision_model(torch_model, import_args): - import torchvision.models as models - - default_image_size = (224, 224) - modelname = torch_model - if modelname == "alexnet": - torch_model = models.alexnet(weights="DEFAULT") - input_image_size = default_image_size - if modelname == "resnet18": - torch_model = models.resnet18(weights="DEFAULT") - input_image_size = default_image_size - if modelname == "resnet50": - torch_model = models.resnet50(weights="DEFAULT") - input_image_size = default_image_size - if modelname == "resnet50_fp16": - torch_model = models.resnet50(weights="DEFAULT") - input_image_size = default_image_size - if modelname == "resnet50_fp16": - torch_model = models.resnet50(weights="DEFAULT") - input_image_size = default_image_size - if modelname == "resnet101": - torch_model = models.resnet101(weights="DEFAULT") - input_image_size = default_image_size - if modelname == "squeezenet1_0": - torch_model = models.squeezenet1_0(weights="DEFAULT") - input_image_size = default_image_size - if modelname == "wide_resnet50_2": - torch_model = models.wide_resnet50_2(weights="DEFAULT") - input_image_size = default_image_size - if modelname == "mobilenet_v3_small": - torch_model = models.mobilenet_v3_small(weights="DEFAULT") - input_image_size = default_image_size - if modelname == "mnasnet1_0": - torch_model = models.mnasnet1_0(weights="DEFAULT") - input_image_size = default_image_size - - model = VisionModule(torch_model) - test_input = torch.randn(int(import_args["batch_size"]), 3, *input_image_size) - actual_out = model(test_input) - return model, test_input, actual_out - - -def compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name): - flags = [ - "--iree-input-type=torch", - "--mlir-print-debuginfo", - "--mlir-print-op-on-diagnostic=false", - "--iree-llvmcpu-target-cpu-features=host", - "--iree-llvmcpu-target-triple=x86_64-linux-gnu", - "--iree-stream-resource-index-bits=64", - "--iree-vm-target-index-bits=64", - "--iree-flow-inline-constants-max-byte-length=1", - ] - if device == "cpu": - flags.append("--iree-llvmcpu-enable-ukernels=all") - device = "llvm-cpu" - elif device == "vulkan": - flags.extend( - [ - "--iree-hal-target-backends=vulkan-spirv", - "--iree-vulkan-target-triple=" + target_triple, - "--iree-stream-resource-max-allocation-size=" + max_alloc, - ] - ) - elif device == "rocm": - flags.extend( - [ - "--iree-hal-target-backends=rocm", - "--iree-rocm-target-chip=" + target_triple, - "--iree-rocm-link-bc=true", - "--iree-rocm-bc-dir=/opt/rocm/amdgcn/bitcode", - "--iree-vm-bytecode-module-strip-source-map=true", - "--iree-opt-strip-assertions=true", - "--iree-vm-target-truncate-unsupported-floats", - ] - ) - elif device == "cuda": - flags.extend( - [ 
- "--iree-hal-target-backends=cuda", - "--iree-hal-cuda-llvm-target-arch=" + target_triple, - "--iree-vm-bytecode-module-strip-source-map=true", - "--iree-vm-target-truncate-unsupported-floats", - ] - ) - else: - print("incorrect device: ", device) - - flatbuffer_blob = ireec.compile_str( - module_str, - target_backends=[device], - extra_args=flags, - ) - with open(f"{safe_name}.vmfb", "wb+") as f: - f.write(flatbuffer_blob) - print("Saved to", safe_name + ".vmfb") - - -def classic_flow(model, model_name, input, out, run_e2e, expected_err): - vmfb_name = model_name.replace("/", "_") + ".vmfb" - model.get_compiled_module(save_to=vmfb_name) - - # if model is not supposed to run e2e, exit at this point (mlir has been uploaded) - if run_e2e is False: - assert expected_err > 0 - return - - # run inference using iree runtime - runner = vmfbRunner("local-task", vmfb_name) - inputs = [ireert.asdevicearray(runner.config.device, input)] - keys = list(runner.ctx.modules) - key = keys[len(keys) - 1] - results = runner.ctx.modules.__getattr__(key)["main"](*inputs) - err = utils.largest_error(out.cpu().detach().numpy(), results) - # cleanup - os.remove(vmfb_name) - # accuracy - assert err < expected_err - - -def param_flow(model, model_name, model_type, input, out, run_e2e, expected_err): - weight_name = model_name.replace("/", "_") + ".safetensors" - mapper = {} - utils.save_external_weights(mapper, model.model, "safetensors", weight_name) - - # seq2seq models differs from rest as it take two inputs (input_ids, decoder_input_ids) - if model_type == "hf_seq2seq": - - class Seq2SeqModule(CompiledModule): - params = export_parameters( - model.model, external=True, external_scope="", name_mapper=mapper.get - ) - - def main( - self, - inp1=AbstractTensor(*(input[0].shape), dtype=input[0].dtype), - inp2=AbstractTensor(*(input[1].shape), dtype=input[1].dtype), - ): - return jittable(model.model.forward)(inp1, inp2) - - inst = Seq2SeqModule(context=Context(), import_to="IMPORT") - module_str = str(CompiledModule.get_mlir_module(inst)) - else: - - class GlobalModule(CompiledModule): - params = export_parameters( - model.model, external=True, external_scope="", name_mapper=mapper.get - ) - - def main(self, inp=AbstractTensor(*input.shape, dtype=input.dtype)): - return jittable(model.model.forward)(inp) - - inst = GlobalModule(context=Context(), import_to="IMPORT") - module_str = str(CompiledModule.get_mlir_module(inst)) - - mlir_name = model_name.replace("/", "_") + ".mlir" - with open(mlir_name, "w+") as f: - f.write(module_str) - - model_name_upload = model_name.replace("/", "_") - turbine_tank.uploadToBlobStorage( - str(os.path.abspath(mlir_name)), - f"{model_name_upload}/{model_name_upload}-params.mlir", - ) - - os.remove(mlir_name) - - if run_e2e is False: - assert expected_err > 0 - return - - vmfb_name = model_name.replace("/", "_") - tank_util.compile_to_vmfb(module_str, "cpu", "", "", vmfb_name) - - # run inference using iree runtime - runner = vmfbRunner("local-task", vmfb_name + ".vmfb", weight_name) - inputs = [ireert.asdevicearray(runner.config.device, input)] - keys = list(runner.ctx.modules) - key = keys[len(keys) - 1] - results = runner.ctx.modules.__getattr__(key)["main"](*inputs) - err = utils.largest_error(out.cpu().detach().numpy(), results) - - # clean up - os.remove(vmfb_name + ".vmfb") - os.remove(weight_name) - - # accuracy - assert err < expected_err From 821ecf329317afec096c7c32c1ccf3f0a7075e5f Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 28 Feb 2024 14:14:15 -0800 Subject: 
[PATCH 15/20] add for schedulers too --- .../custom_models/sd_inference/schedulers.py | 12 ++++++++++++ models/turbine_models/tests/sd_test.py | 2 ++ 2 files changed, 14 insertions(+) diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index 97bd2418f..a9475d080 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -23,6 +23,8 @@ import safetensors import argparse +from turbine_models.turbine_tank import turbine_tank + parser = argparse.ArgumentParser() parser.add_argument( "--hf_auth_token", type=str, help="The Hugging Face auth token, required" @@ -111,6 +113,7 @@ def export_scheduler( device=None, target_triple=None, max_alloc=None, + upload_ir=False, ): mapper = {} utils.save_external_weights( @@ -145,6 +148,15 @@ def main( module_str = str(CompiledModule.get_mlir_module(inst)) safe_name = utils.create_safe_name(hf_model_name, "-scheduler") + if upload_ir: + with open(f"{safe_name}.mlir", "w+") as f: + f.write(module_str) + model_name_upload = hf_model_name.replace("/", "_") + model_name_upload = model_name_upload + "-scheduler" + turbine_tank.uploadToBlobStorage( + str(os.path.abspath(f"{safe_name}.mlir")), + f"{model_name_upload}/{model_name_upload}.mlir", + ) if compile_to != "vmfb": return module_str else: diff --git a/models/turbine_models/tests/sd_test.py b/models/turbine_models/tests/sd_test.py index 01887db72..9d00fb9e5 100644 --- a/models/turbine_models/tests/sd_test.py +++ b/models/turbine_models/tests/sd_test.py @@ -247,6 +247,7 @@ def testExportVaeModelEncode(self): @unittest.expectedFailure def testExportPNDMScheduler(self): + upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload") with self.assertRaises(SystemExit) as cm: schedulers.export_scheduler( scheduler_module, @@ -260,6 +261,7 @@ def testExportPNDMScheduler(self): "safetensors", "stable_diffusion_v1_4_scheduler.safetensors", "cpu", + upload_ir=upload_ir_var == "upload", ) self.assertEqual(cm.exception.code, None) arguments[ From 8bcac10332bf66147b95841684878652288cd87b Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 28 Feb 2024 14:25:25 -0800 Subject: [PATCH 16/20] better var name --- models/turbine_models/model_builder.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/models/turbine_models/model_builder.py b/models/turbine_models/model_builder.py index 2577a39b5..f91c65a08 100644 --- a/models/turbine_models/model_builder.py +++ b/models/turbine_models/model_builder.py @@ -29,7 +29,7 @@ def __init__( upload_ir=False, model=None, model_type: str = None, - run_e2e: bool = None, + compile_to_vmfb: bool = None, ) -> None: self.example_input = example_input self.hf_id = hf_id @@ -41,7 +41,7 @@ def __init__( self.tokenizer = None self.upload_ir = upload_ir self.model_type = model_type - self.run_e2e = run_e2e + self.compile_to_vmfb = compile_to_vmfb if self.model == None: self.build_model() @@ -88,7 +88,7 @@ def get_compiled_module(self, save_to: str = None) -> aot.CompiledModule: f"{model_name_upload}/{model_name_upload}.mlir", ) os.remove(f"{safe_name}.mlir") - if self.run_e2e is not None and self.run_e2e is False: + if self.compile_to_vmfb is not None and self.compile_to_vmfb is False: return compiled_binary = module.compile(save_to=save_to) return compiled_binary From 7736efcd68055769217c1cff0f881791582985f4 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 28 Feb 2024 14:26:29 -0800 Subject: 
[PATCH 17/20] empty init file --- models/turbine_models/turbine_tank/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 models/turbine_models/turbine_tank/__init__.py diff --git a/models/turbine_models/turbine_tank/__init__.py b/models/turbine_models/turbine_tank/__init__.py new file mode 100644 index 000000000..e69de29bb From 45a3a275370bacd6e58e7637aeeb4efd6859fa4a Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 28 Feb 2024 16:29:40 -0800 Subject: [PATCH 18/20] clean checks --- models/turbine_models/model_builder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/turbine_models/model_builder.py b/models/turbine_models/model_builder.py index f91c65a08..035244534 100644 --- a/models/turbine_models/model_builder.py +++ b/models/turbine_models/model_builder.py @@ -71,7 +71,7 @@ def get_compiled_module(self, save_to: str = None) -> aot.CompiledModule: Returns: aot.CompiledModule: The compiled module binary. """ - if self.model_type == "hf_seq2seq": + if self.model_type and self.model_type == "hf_seq2seq": module = aot.export(self.model, *self.example_input) else: module = aot.export(self.model, self.example_input) @@ -88,7 +88,7 @@ def get_compiled_module(self, save_to: str = None) -> aot.CompiledModule: f"{model_name_upload}/{model_name_upload}.mlir", ) os.remove(f"{safe_name}.mlir") - if self.compile_to_vmfb is not None and self.compile_to_vmfb is False: + if self.compile_to_vmfb is False: return compiled_binary = module.compile(save_to=save_to) return compiled_binary From a898364e7d6bc6bc61e4a20dd1a73cf4880be47c Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 28 Feb 2024 18:16:14 -0800 Subject: [PATCH 19/20] address nit --- models/turbine_models/custom_models/sd_inference/clip.py | 2 +- models/turbine_models/custom_models/sd_inference/schedulers.py | 2 +- models/turbine_models/custom_models/sd_inference/unet.py | 2 +- models/turbine_models/custom_models/sd_inference/vae.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index 4cc5f91dd..b37eaf847 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -101,7 +101,7 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)): with open(f"{safe_name}.mlir", "w+") as f: f.write(module_str) model_name_upload = hf_model_name.replace("/", "_") - model_name_upload += "-clip" + model_name_upload += "_clip" turbine_tank.uploadToBlobStorage( str(os.path.abspath(f"{safe_name}.mlir")), f"{model_name_upload}/{model_name_upload}.mlir", diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index a9475d080..951187524 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -152,7 +152,7 @@ def main( with open(f"{safe_name}.mlir", "w+") as f: f.write(module_str) model_name_upload = hf_model_name.replace("/", "_") - model_name_upload = model_name_upload + "-scheduler" + model_name_upload = model_name_upload + "_scheduler" turbine_tank.uploadToBlobStorage( str(os.path.abspath(f"{safe_name}.mlir")), f"{model_name_upload}/{model_name_upload}.mlir", diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index 
2c1556e84..1b351a078 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -131,7 +131,7 @@ def main( with open(f"{safe_name}.mlir", "w+") as f: f.write(module_str) model_name_upload = hf_model_name.replace("/", "_") - model_name_upload += "-unet" + model_name_upload += "_unet" turbine_tank.uploadToBlobStorage( str(os.path.abspath(f"{safe_name}.mlir")), f"{model_name_upload}/{model_name_upload}.mlir", diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index fcf9453b4..e169187ea 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -119,7 +119,7 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)): with open(f"{safe_name}.mlir", "w+") as f: f.write(module_str) model_name_upload = hf_model_name.replace("/", "_") - model_name_upload = model_name_upload + "-vae-" + variant + model_name_upload = model_name_upload + "_vae_" + variant turbine_tank.uploadToBlobStorage( str(os.path.abspath(f"{safe_name}.mlir")), f"{model_name_upload}/{model_name_upload}.mlir", From 4d7edfa1c5450a74462579fc5b96660bffb1ed84 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 28 Feb 2024 18:39:17 -0800 Subject: [PATCH 20/20] revert nit --- models/turbine_models/custom_models/sd_inference/clip.py | 2 +- models/turbine_models/custom_models/sd_inference/schedulers.py | 2 +- models/turbine_models/custom_models/sd_inference/unet.py | 2 +- models/turbine_models/custom_models/sd_inference/vae.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index b37eaf847..4cc5f91dd 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -101,7 +101,7 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)): with open(f"{safe_name}.mlir", "w+") as f: f.write(module_str) model_name_upload = hf_model_name.replace("/", "_") - model_name_upload += "_clip" + model_name_upload += "-clip" turbine_tank.uploadToBlobStorage( str(os.path.abspath(f"{safe_name}.mlir")), f"{model_name_upload}/{model_name_upload}.mlir", diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index 951187524..6dafeb313 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -151,7 +151,7 @@ def main( if upload_ir: with open(f"{safe_name}.mlir", "w+") as f: f.write(module_str) - model_name_upload = hf_model_name.replace("/", "_") + model_name_upload = hf_model_name.replace("/", "-") model_name_upload = model_name_upload + "_scheduler" turbine_tank.uploadToBlobStorage( str(os.path.abspath(f"{safe_name}.mlir")), diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index 1b351a078..398ed9bc5 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -130,7 +130,7 @@ def main( if upload_ir: with open(f"{safe_name}.mlir", "w+") as f: f.write(module_str) - model_name_upload = hf_model_name.replace("/", "_") + model_name_upload = hf_model_name.replace("/", "-") model_name_upload += 
"_unet" turbine_tank.uploadToBlobStorage( str(os.path.abspath(f"{safe_name}.mlir")), diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index e169187ea..fcf9453b4 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -119,7 +119,7 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)): with open(f"{safe_name}.mlir", "w+") as f: f.write(module_str) model_name_upload = hf_model_name.replace("/", "_") - model_name_upload = model_name_upload + "_vae_" + variant + model_name_upload = model_name_upload + "-vae-" + variant turbine_tank.uploadToBlobStorage( str(os.path.abspath(f"{safe_name}.mlir")), f"{model_name_upload}/{model_name_upload}.mlir",