From 94c09e402adcdff60c235f603022fc74a6e123a5 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 14 Feb 2024 03:52:23 -0800 Subject: [PATCH 01/20] turbine tank --- models/requirements.txt | 2 + .../custom_models/sd_inference/clip.py | 32 ++ .../custom_models/sd_inference/unet.py | 31 ++ .../custom_models/sd_inference/utils.py | 2 +- .../custom_models/sd_inference/vae.py | 31 ++ .../custom_models/stateless_llama.py | 42 +- .../turbine_models/turbine_tank/run_models.py | 404 ++++++++++++++++++ .../turbine_tank/turbine_tank.py | 143 +++++++ 8 files changed, 681 insertions(+), 6 deletions(-) create mode 100644 models/turbine_models/turbine_tank/run_models.py create mode 100644 models/turbine_models/turbine_tank/turbine_tank.py diff --git a/models/requirements.txt b/models/requirements.txt index 4d2d16a56..99678eb68 100644 --- a/models/requirements.txt +++ b/models/requirements.txt @@ -5,3 +5,5 @@ transformers accelerate diffusers==0.24.0 brevitas @ git+https://github.com/Xilinx/brevitas.git@6695e8df7f6a2c7715b9ed69c4b78157376bb60b +# turbine tank downloading/uploading +azure-storage-blob diff --git a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index 996d5fb83..a2ab030ef 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -16,6 +16,7 @@ import torch import torch._dynamo as dynamo from transformers import CLIPTextModel, CLIPTokenizer +from turbine_models.turbine_tank import turbine_tank import argparse @@ -46,6 +47,18 @@ help="Specify vulkan target triple or rocm/cuda target device.", ) parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") +parser.add_argument( + "--download_ir", + action=argparse.BooleanOptionalAction, + default=True, + help="download IR from turbine tank", +) +parser.add_argument( + "--upload_ir", + action=argparse.BooleanOptionalAction, + default=False, + help="upload IR to turbine tank", +) def export_clip_model( @@ -57,6 +70,8 @@ def export_clip_model( device=None, target_triple=None, max_alloc=None, + download_ir=False, + upload_ir=False, ): # Load the tokenizer and text encoder to tokenize and encode the text. 
tokenizer = CLIPTokenizer.from_pretrained( @@ -64,6 +79,10 @@ def export_clip_model( subfolder="tokenizer", token=hf_auth_token, ) + + if download_ir: + return turbine_tank.downloadModelArtifacts(hf_model_name + "-clip"), tokenizer + text_encoder_model = CLIPTextModel.from_pretrained( hf_model_name, subfolder="text_encoder", @@ -94,6 +113,15 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)): module_str = str(CompiledModule.get_mlir_module(inst)) safe_name = utils.create_safe_name(hf_model_name, "-clip") + if upload_ir: + with open(f"{safe_name}.mlir", "w+") as f: + f.write(module_str) + model_name_upload = hf_model_name.replace("/", "_") + model_name_upload += "-clip" + turbine_tank.uploadToBlobStorage( + str(os.path.abspath(f"{safe_name}.mlir")), + f"{model_name_upload}/{model_name_upload}.mlir", + ) if compile_to != "vmfb": return module_str, tokenizer else: @@ -102,6 +130,8 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)): if __name__ == "__main__": args = parser.parse_args() + if args.upload_ir and args.download_ir: + raise ValueError("upload_ir and download_ir can't both be true") mod_str, _ = export_clip_model( args.hf_model_name, args.hf_auth_token, @@ -111,6 +141,8 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)): args.device, args.iree_target_triple, args.vulkan_max_allocation, + args.download_ir, + args.upload_ir, ) safe_name = args.hf_model_name.split("/")[-1].strip() safe_name = re.sub("-", "_", safe_name) diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index 272c7af7f..d193ded78 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -18,6 +18,7 @@ import safetensors import argparse +from turbine_models.turbine_tank import turbine_tank parser = argparse.ArgumentParser() parser.add_argument( @@ -53,6 +54,18 @@ help="Specify vulkan target triple or rocm/cuda target device.", ) parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") +parser.add_argument( + "--download_ir", + action=argparse.BooleanOptionalAction, + default=True, + help="download IR from turbine tank", +) +parser.add_argument( + "--upload_ir", + action=argparse.BooleanOptionalAction, + default=False, + help="upload IR to turbine tank", +) class UnetModel(torch.nn.Module): @@ -90,7 +103,12 @@ def export_unet_model( device=None, target_triple=None, max_alloc=None, + download_ir=False, + upload_ir=False, ): + if download_ir: + return turbine_tank.downloadModelArtifacts(hf_model_name + "-unet") + mapper = {} utils.save_external_weights( mapper, unet_model, external_weights, external_weight_path @@ -125,6 +143,15 @@ def main( module_str = str(CompiledModule.get_mlir_module(inst)) safe_name = utils.create_safe_name(hf_model_name, "-unet") + if upload_ir: + with open(f"{safe_name}.mlir", "w+") as f: + f.write(module_str) + model_name_upload = hf_model_name.replace("/", "_") + model_name_upload += "-unet" + turbine_tank.uploadToBlobStorage( + str(os.path.abspath(f"{safe_name}.mlir")), + f"{model_name_upload}/{model_name_upload}.mlir", + ) if compile_to != "vmfb": return module_str else: @@ -133,6 +160,8 @@ def main( if __name__ == "__main__": args = parser.parse_args() + if args.upload_ir and args.download_ir: + raise ValueError("upload_ir and download_ir can't both be true") unet_model = UnetModel( args.hf_model_name, args.hf_auth_token, @@ -150,6 +179,8 @@ def main( args.device, 
args.iree_target_triple, args.vulkan_max_allocation, + args.download_ir, + args.upload_ir, ) safe_name = utils.create_safe_name(args.hf_model_name, "-unet") with open(f"{safe_name}.mlir", "w+") as f: diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 37787fd3a..c4898dac7 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -79,7 +79,7 @@ def compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name): with open(f"{safe_name}.vmfb", "wb+") as f: f.write(flatbuffer_blob) print("Saved to", safe_name + ".vmfb") - exit() + return def create_safe_name(hf_model_name, model_name_str): diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index 03ef85556..2aef05bcf 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -18,6 +18,7 @@ import safetensors import argparse +from turbine_models.turbine_tank import turbine_tank parser = argparse.ArgumentParser() parser.add_argument( @@ -54,6 +55,18 @@ ) parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") parser.add_argument("--variant", type=str, default="decode") +parser.add_argument( + "--download_ir", + action=argparse.BooleanOptionalAction, + default=True, + help="download IR from turbine tank", +) +parser.add_argument( + "--upload_ir", + action=argparse.BooleanOptionalAction, + default=False, + help="upload IR to turbine tank", +) class VaeModel(torch.nn.Module): @@ -89,7 +102,12 @@ def export_vae_model( target_triple=None, max_alloc=None, variant="decode", + download_ir=False, + upload_ir=False, ): + if download_ir: + return turbine_tank.downloadModelArtifacts(hf_model_name + "-" + variant) + mapper = {} utils.save_external_weights( mapper, vae_model, external_weights, external_weight_path @@ -113,6 +131,15 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)): module_str = str(CompiledModule.get_mlir_module(inst)) safe_name = utils.create_safe_name(hf_model_name, "-vae") + if upload_ir: + with open(f"{safe_name}.mlir", "w+") as f: + f.write(module_str) + model_name_upload = hf_model_name.replace("/", "_") + model_name_upload = model_name_upload + "-" + variant + turbine_tank.uploadToBlobStorage( + str(os.path.abspath(f"{safe_name}.mlir")), + f"{model_name_upload}/{model_name_upload}.mlir", + ) if compile_to != "vmfb": return module_str else: @@ -121,6 +148,8 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)): if __name__ == "__main__": args = parser.parse_args() + if args.upload_ir and args.download_ir: + raise ValueError("upload_ir and download_ir can't both be true") vae_model = VaeModel( args.hf_model_name, args.hf_auth_token, @@ -139,6 +168,8 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)): args.iree_target_triple, args.vulkan_max_allocation, args.variant, + args.download_ir, + args.upload_ir, ) safe_name = utils.create_safe_name(args.hf_model_name, "-vae") with open(f"{safe_name}.mlir", "w+") as f: diff --git a/models/turbine_models/custom_models/stateless_llama.py b/models/turbine_models/custom_models/stateless_llama.py index 762690603..5e4c7ca1a 100644 --- a/models/turbine_models/custom_models/stateless_llama.py +++ b/models/turbine_models/custom_models/stateless_llama.py @@ -2,6 +2,7 @@ import sys import re import json +from 
turbine_models.turbine_tank import turbine_tank os.environ["TORCH_LOGS"] = "dynamic" from transformers import AutoTokenizer, AutoModelForCausalLM @@ -61,6 +62,18 @@ action="store_true", help="Compile LLM with StreamingLLM optimizations", ) +parser.add_argument( + "--download_ir", + action=argparse.BooleanOptionalAction, + default=True, + help="download IR from turbine tank", +) +parser.add_argument( + "--upload_ir", + action=argparse.BooleanOptionalAction, + default=False, + help="upload IR to turbine tank", +) def generate_schema(num_layers): @@ -107,7 +120,18 @@ def export_transformer_model( vulkan_max_allocation=None, streaming_llm=False, vmfb_path=None, + download_ir=False, + upload_ir=False, ): + tokenizer = AutoTokenizer.from_pretrained( + hf_model_name, + use_fast=False, + token=hf_auth_token, + ) + + if download_ir: + return turbine_tank.downloadModelArtifacts(hf_model_name), tokenizer + mod = AutoModelForCausalLM.from_pretrained( hf_model_name, torch_dtype=torch.float, @@ -121,11 +145,7 @@ def export_transformer_model( if precision == "f16": mod = mod.half() dtype = torch.float16 - tokenizer = AutoTokenizer.from_pretrained( - hf_model_name, - use_fast=False, - token=hf_auth_token, - ) + # TODO: generate these values instead of magic numbers NUM_LAYERS = mod.config.num_hidden_layers HEADS = getattr(mod.config, "num_key_value_heads", None) @@ -319,6 +339,14 @@ def evict_kvcache_space(self): module_str = str(CompiledModule.get_mlir_module(inst)) safe_name = hf_model_name.split("/")[-1].strip() safe_name = re.sub("-", "_", safe_name) + if upload_ir: + with open(f"{safe_name}.mlir", "w+") as f: + f.write(module_str) + model_name_upload = hf_model_name.replace("/", "_") + turbine_tank.uploadToBlobStorage( + str(os.path.abspath(f"{safe_name}.mlir")), + f"{model_name_upload}/{model_name_upload}.mlir", + ) if compile_to != "vmfb": return module_str, tokenizer else: @@ -382,6 +410,8 @@ def evict_kvcache_space(self): if __name__ == "__main__": args = parser.parse_args() + if args.upload_ir and args.download_ir: + raise ValueError("upload_ir and download_ir can't both be true") mod_str, _ = export_transformer_model( args.hf_model_name, args.hf_auth_token, @@ -395,6 +425,8 @@ def evict_kvcache_space(self): args.vulkan_max_allocation, args.streaming_llm, args.vmfb_path, + args.download_ir, + args.upload_ir, ) safe_name = args.hf_model_name.split("/")[-1].strip() safe_name = re.sub("-", "_", safe_name) diff --git a/models/turbine_models/turbine_tank/run_models.py b/models/turbine_models/turbine_tank/run_models.py new file mode 100644 index 000000000..5d612c4ee --- /dev/null +++ b/models/turbine_models/turbine_tank/run_models.py @@ -0,0 +1,404 @@ +# Copyright 2023 Nod Labs, Inc +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import argparse +from turbine_models.custom_models.sd_inference import ( + clip, + clip_runner, + unet, + unet_runner, + vae, + vae_runner, +) + +from turbine_models.custom_models.sd_inference import utils +import torch +import os +import turbine_models.custom_models.stateless_llama as llama +import difflib +from turbine_models.turbine_tank import turbine_tank + +parser = argparse.ArgumentParser() +parser.add_argument( + "--download_ir", + action=argparse.BooleanOptionalAction, + default=False, + help="download IR from turbine tank", +) +parser.add_argument( + "--upload_ir", + action=argparse.BooleanOptionalAction, + default=True, + help="upload IR to turbine tank", +) + +os.environ["TORCH_LOGS"] = "dynamic" +from shark_turbine.aot import * +from turbine_models.custom_models import llm_runner + +from turbine_models.gen_external_params.gen_external_params import ( + gen_external_params, +) + +DEFAULT_PROMPT = """[INST] <> +Be concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <> hi what are you? [/INST] +""" + + +def check_output_string(reference, output): + # Calculate and print diff + diff = difflib.unified_diff( + reference.splitlines(keepends=True), + output.splitlines(keepends=True), + fromfile="reference", + tofile="output", + lineterm="", + ) + return "".join(diff) + + +def run_llama_model(download_ir=False, upload_ir=True): + if not download_ir: + gen_external_params( + hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2", + hf_auth_token=None, + ) + llama.export_transformer_model( + hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2", + hf_auth_token=None, + compile_to="vmfb", + external_weights="safetensors", + # external_weight_file="Llama-2-7b-chat-hf-function-calling-v2_f16_int4.safetensors", Do not export weights because this doesn't get quantized + quantization="int4", + precision="f16", + device="llvm-cpu", + target_triple="host", + download_ir=download_ir, + upload_ir=upload_ir, + ) + + if download_ir: + return + + torch_str_cache_path = ( + f"models/turbine_models/tests/vmfb_comparison_cached_torch_output_f16_int4.txt" + ) + # if cached, just read + if os.path.exists(torch_str_cache_path): + with open(torch_str_cache_path, "r") as f: + torch_str = f.read() + else: + torch_str = llm_runner.run_torch_llm( + "Trelis/Llama-2-7b-chat-hf-function-calling-v2", None, DEFAULT_PROMPT + ) + + with open(torch_str_cache_path, "w") as f: + f.write(torch_str) + + turbine_str = llm_runner.run_llm( + "local-task", + DEFAULT_PROMPT, + "Llama_2_7b_chat_hf_function_calling_v2.vmfb", + "Trelis/Llama-2-7b-chat-hf-function-calling-v2", + None, + f"Llama_2_7b_chat_hf_function_calling_v2_f16_int4.safetensors", + ) + + result = check_output_string(torch_str, turbine_str) + + # clean up + os.remove("Llama_2_7b_chat_hf_function_calling_v2_f16_int4.safetensors") + os.remove("Llama_2_7b_chat_hf_function_calling_v2.vmfb") + os.remove("Llama_2_7b_chat_hf_function_calling_v2.mlir") + + return result + + +arguments = { + "hf_auth_token": None, + "hf_model_name": "CompVis/stable-diffusion-v1-4", + "batch_size": 1, + "height": 512, + "width": 512, + "run_vmfb": True, + "compile_to": None, + "external_weight_path": "", + "vmfb_path": "", + "external_weights": None, + "device": "local-task", + "iree_target_triple": 
"", + "vulkan_max_allocation": "4294967296", + "prompt": "a photograph of an astronaut riding a horse", + "in_channels": 4, +} + + +unet_model = unet.UnetModel( + # This is a public model, so no auth required + "CompVis/stable-diffusion-v1-4", + None, +) + +vae_model = vae.VaeModel( + # This is a public model, so no auth required + "CompVis/stable-diffusion-v1-4", + None, +) + + +def run_clip_model(download_ir=False, upload_ir=True): + clip.export_clip_model( + # This is a public model, so no auth required + "CompVis/stable-diffusion-v1-4", + None, + "vmfb", + "safetensors", + "stable_diffusion_v1_4_clip.safetensors", + "cpu", + download_ir=download_ir, + upload_ir=upload_ir, + ) + + if download_ir: + return + + arguments["external_weight_path"] = "stable_diffusion_v1_4_clip.safetensors" + arguments["vmfb_path"] = "stable_diffusion_v1_4_clip.vmfb" + turbine = clip_runner.run_clip( + arguments["device"], + arguments["prompt"], + arguments["vmfb_path"], + arguments["hf_model_name"], + arguments["hf_auth_token"], + arguments["external_weight_path"], + ) + torch_output = clip_runner.run_torch_clip( + arguments["hf_model_name"], arguments["hf_auth_token"], arguments["prompt"] + ) + err = utils.largest_error(torch_output, turbine[0]) + if err < 9e-5: + result = "CLIP SUCCESS: " + str(err) + else: + result = "CLIP FAILURE: " + str(err) + + # clean up + os.remove("stable_diffusion_v1_4_clip.safetensors") + os.remove("stable_diffusion_v1_4_clip.vmfb") + os.remove("stable_diffusion_v1_4_clip.mlir") + + return result + + +def run_unet_model(download_ir=False, upload_ir=True): + unet.export_unet_model( + unet_model, + # This is a public model, so no auth required + "CompVis/stable-diffusion-v1-4", + arguments["batch_size"], + arguments["height"], + arguments["width"], + None, + "vmfb", + "safetensors", + "stable_diffusion_v1_4_unet.safetensors", + "cpu", + download_ir=download_ir, + upload_ir=upload_ir, + ) + + if download_ir: + return + + arguments["external_weight_path"] = "stable_diffusion_v1_4_unet.safetensors" + arguments["vmfb_path"] = "stable_diffusion_v1_4_unet.vmfb" + sample = torch.rand( + arguments["batch_size"], + arguments["in_channels"], + arguments["height"] // 8, + arguments["width"] // 8, + dtype=torch.float32, + ) + timestep = torch.zeros(1, dtype=torch.float32) + encoder_hidden_states = torch.rand(2, 77, 768, dtype=torch.float32) + + turbine = unet_runner.run_unet( + arguments["device"], + sample, + timestep, + encoder_hidden_states, + arguments["vmfb_path"], + arguments["hf_model_name"], + arguments["hf_auth_token"], + arguments["external_weight_path"], + ) + torch_output = unet_runner.run_torch_unet( + arguments["hf_model_name"], + arguments["hf_auth_token"], + sample, + timestep, + encoder_hidden_states, + ) + err = utils.largest_error(torch_output, turbine) + if err < 9e-5: + result = "UNET SUCCESS: " + str(err) + else: + result = "UNET FAILURE: " + str(err) + + # clean up + os.remove("stable_diffusion_v1_4_unet.safetensors") + os.remove("stable_diffusion_v1_4_unet.vmfb") + os.remove("stable_diffusion_v1_4_unet.mlir") + + return result + + +def run_vae_decode(download_ir=False, upload_ir=True): + vae.export_vae_model( + vae_model, + # This is a public model, so no auth required + "CompVis/stable-diffusion-v1-4", + arguments["batch_size"], + arguments["height"], + arguments["width"], + None, + "vmfb", + "safetensors", + "stable_diffusion_v1_4_vae.safetensors", + "cpu", + variant="decode", + download_ir=download_ir, + upload_ir=upload_ir, + ) + + if download_ir: + return + + 
arguments["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors" + arguments["vmfb_path"] = "stable_diffusion_v1_4_vae.vmfb" + example_input = torch.rand( + arguments["batch_size"], + 4, + arguments["height"] // 8, + arguments["width"] // 8, + dtype=torch.float32, + ) + turbine = vae_runner.run_vae( + arguments["device"], + example_input, + arguments["vmfb_path"], + arguments["hf_model_name"], + arguments["hf_auth_token"], + arguments["external_weight_path"], + ) + torch_output = vae_runner.run_torch_vae( + arguments["hf_model_name"], + arguments["hf_auth_token"], + "decode", + example_input, + ) + err = utils.largest_error(torch_output, turbine) + if err < 9e-5: + result = "VAE DECODE SUCCESS: " + str(err) + else: + result = "VAE DECODE FAILURE: " + str(err) + + # clean up + os.remove("stable_diffusion_v1_4_vae.safetensors") + os.remove("stable_diffusion_v1_4_vae.vmfb") + os.remove("stable_diffusion_v1_4_vae.mlir") + + return result + + +def run_vae_encode(download_ir=False, upload_ir=True): + vae.export_vae_model( + vae_model, + # This is a public model, so no auth required + "CompVis/stable-diffusion-v1-4", + arguments["batch_size"], + arguments["height"], + arguments["width"], + None, + "vmfb", + "safetensors", + "stable_diffusion_v1_4_vae.safetensors", + "cpu", + variant="encode", + download_ir=download_ir, + upload_ir=upload_ir, + ) + + if download_ir: + return + + arguments["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors" + arguments["vmfb_path"] = "stable_diffusion_v1_4_vae.vmfb" + example_input = torch.rand( + arguments["batch_size"], + 3, + arguments["height"], + arguments["width"], + dtype=torch.float32, + ) + turbine = vae_runner.run_vae( + arguments["device"], + example_input, + arguments["vmfb_path"], + arguments["hf_model_name"], + arguments["hf_auth_token"], + arguments["external_weight_path"], + ) + torch_output = vae_runner.run_torch_vae( + arguments["hf_model_name"], + arguments["hf_auth_token"], + "encode", + example_input, + ) + err = utils.largest_error(torch_output, turbine) + if err < 2e-3: + result = "VAE ENCODE SUCCESS: " + str(err) + else: + result = "VAE ENCODE FAILURE: " + str(err) + + # clean up + os.remove("stable_diffusion_v1_4_vae.safetensors") + os.remove("stable_diffusion_v1_4_vae.vmfb") + os.remove("stable_diffusion_v1_4_vae.mlir") + + return result + + +if __name__ == "__main__": + args = parser.parse_args() + + if args.upload_ir and args.download_ir: + raise ValueError("upload_ir and download_ir can't both be true") + + if args.upload_ir: + result = "Turbine Tank Results\n" + llama_result = run_llama_model(args.download_ir, args.upload_ir) + result += llama_result + "\n" + clip_result = run_clip_model(args.download_ir, args.upload_ir) + result += clip_result + "\n" + unet_result = run_unet_model(args.download_ir, args.upload_ir) + result += unet_result + "\n" + vae_decode_result = run_vae_decode(args.download_ir, args.upload_ir) + result += vae_decode_result + "\n" + vae_encode_result = run_vae_encode(args.download_ir, args.upload_ir) + result += vae_encode_result + "\n" + f = open("daily_report.txt", "a") + f.write(result) + f.close() + turbine_tank.uploadToBlobStorage( + str(os.path.abspath("daily_report.txt")), "daily_report.txt" + ) + os.remove("daily_report.txt") + else: + run_llama_model(args.download_ir, args.upload_ir) + run_clip_model(args.download_ir, args.upload_ir) + run_unet_model(args.download_ir, args.upload_ir) + run_vae_decode(args.download_ir, args.upload_ir) + run_vae_encode(args.download_ir, 
args.upload_ir) diff --git a/models/turbine_models/turbine_tank/turbine_tank.py b/models/turbine_models/turbine_tank/turbine_tank.py new file mode 100644 index 000000000..92a294e3e --- /dev/null +++ b/models/turbine_models/turbine_tank/turbine_tank.py @@ -0,0 +1,143 @@ +from azure.storage.blob import BlobServiceClient + +import subprocess +import datetime +import os +from pathlib import Path + +custom_path = os.getenv("TURBINE_TANK_CACHE_DIR") +if custom_path is not None: + if not os.path.exists(custom_path): + os.mkdir(custom_path) + + WORKDIR = custom_path + + print(f"Using {WORKDIR} as local turbine_tank cache directory.") +else: + WORKDIR = os.path.join(str(Path.home()), ".local/turbine_tank/") + print( + f"turbine_tank local cache is located at {WORKDIR} . You may change this by assigning the TURBINE_TANK_CACHE_DIR environment variable." + ) +os.makedirs(WORKDIR, exist_ok=True) + +storage_account_key = "XSsr+KqxBLxXzRtFv3QbbdsAxdwDGe661Q1xY4ziMRtpCazN8W6HZePi6nwud5RNLC5Y7e410abg+AStyzmX1A==" +storage_account_name = "tankturbine" +connection_string = "DefaultEndpointsProtocol=https;AccountName=tankturbine;AccountKey=XSsr+KqxBLxXzRtFv3QbbdsAxdwDGe661Q1xY4ziMRtpCazN8W6HZePi6nwud5RNLC5Y7e410abg+AStyzmX1A==;EndpointSuffix=core.windows.net" +container_name = "tankturbine" + + +def get_short_git_sha() -> str: + try: + return ( + subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]) + .decode("utf-8") + .strip() + ) + except FileNotFoundError: + return None + + +def uploadToBlobStorage(file_path, file_name): + # create our prefix (we use this to keep track of when and what version of turbine is being used) + today = str(datetime.date.today()) + commit = get_short_git_sha() + prefix = today + "_" + commit + blob_service_client = BlobServiceClient.from_connection_string(connection_string) + blob_client = blob_service_client.get_blob_client( + container=container_name, blob=prefix + "/" + file_name + ) + blob = blob_client.from_connection_string( + conn_str=connection_string, + container_name=container_name, + blob_name=blob_client.blob_name, + ) + # we check to see if we already uploaded the blob (don't want to duplicate) + if blob.exists(): + print( + f"model artifacts have already been uploaded for {today} on the same github commit ({commit})" + ) + return + # upload to azure storage container tankturbine + with open(file_path, "rb") as data: + blob_client.upload_blob(data) + print(f"Uploaded {file_name}.") + + +def checkAndRemoveIfDownloadedOld(model_name: str, model_dir: str, prefix: str): + if os.path.isdir(model_dir) and len(os.listdir(model_dir)) == 1: + for item in os.listdir(model_dir): + item_path = os.path.join(model_dir, item) + # model artifacts already downloaded and up to date + # we check if model artifacts are behind using the prefix (day + git_sha) + if os.path.isdir(item_path) and item == prefix: + return True + # model artifacts are behind, so remove for new download + if os.path.isdir(item_path) and os.path.isfile( + os.path.join(item_path, model_name + ".mlir") + ): + os.remove(os.path.join(item_path, model_name + ".mlir")) + os.rmdir(item_path) + return False + # did not downloaded this model artifacts yet + return False + + +def download_public_folder(model_name: str, prefix: str, model_dir: str): + """Downloads a folder of blobs in azure container.""" + blob_service_client = BlobServiceClient.from_connection_string(connection_string) + container_client = blob_service_client.get_container_client( + container=container_name + ) + blob_list = 
container_client.list_blobs(name_starts_with=prefix) + + # go through the blobs with our target prefix + # example prefix: "2024-02-13_26d6428/CompVis_stable-diffusion-v1-4-clip" + for blob in blob_list: + blob_client = blob_service_client.get_blob_client( + container=container_name, blob=blob.name + ) + # create path if directory doesn't exist locally + dest_path = model_dir + if not os.path.isdir(dest_path): + os.makedirs(dest_path) + # download blob into local turbine tank cache + with open( + file=os.path.join(model_dir, model_name + ".mlir"), mode="wb" + ) as sample_blob: + download_stream = blob_client.download_blob() + sample_blob.write(download_stream.readall()) + + +def downloadModelArtifacts(model_name: str) -> str: + model_name = model_name.replace("/", "_") + container_client = BlobServiceClient.from_connection_string( + connection_string + ).get_container_client(container=container_name) + blob_list = container_client.list_blobs() + # get the latest blob uploaded to turbine tank (can't use [] notation for blob_list) + for blob in blob_list: + latest_blob = blob + # get the prefix for the latest blob (2024-02-13_26d6428) + download_latest_prefix = latest_blob.name.split("/")[0] + model_dir = os.path.join(WORKDIR, model_name) + # check if we already downloaded the model artifacts for this day + commit + exists = checkAndRemoveIfDownloadedOld( + model_name=model_name, model_dir=model_dir, prefix=download_latest_prefix + ) + if exists: + print("Already downloaded most recent version") + return "NA" + # download the model artifacts (passing in the model name, path in azure storage to model artifacts, local directory to store) + download_public_folder( + model_name, + download_latest_prefix + "/" + model_name, + os.path.join(model_dir, download_latest_prefix), + ) + model_dir = os.path.join(WORKDIR, model_name + "/" + download_latest_prefix) + mlir_filename = os.path.join(model_dir, model_name + ".mlir") + print( + f"Verifying that model artifacts were downloaded successfully to {mlir_filename}..." 
+ ) + assert os.path.exists(mlir_filename), f"MLIR not found at {mlir_filename}" + + return mlir_filename From 2139ab0633c8ea892e678da488ad1069c1a64744 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 14 Feb 2024 04:16:57 -0800 Subject: [PATCH 02/20] azure dep --- models/setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/models/setup.py b/models/setup.py index cf0ed2d6b..60ae10c4c 100644 --- a/models/setup.py +++ b/models/setup.py @@ -61,5 +61,6 @@ def load_version_info(): "transformers", "accelerate", "diffusers==0.24.0", + "azure-storage-blob", ], ) From d323e630e08beee28ee1f2ae9a9b4926f4579636 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 14 Feb 2024 10:48:42 -0800 Subject: [PATCH 03/20] update compile_to_vmfb and sort for download --- .../custom_models/sd_inference/clip.py | 4 +++- .../custom_models/sd_inference/unet.py | 4 +++- .../custom_models/sd_inference/utils.py | 9 +++++++-- .../turbine_models/custom_models/sd_inference/vae.py | 4 +++- models/turbine_models/turbine_tank/turbine_tank.py | 12 ++++++++++++ 5 files changed, 28 insertions(+), 5 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index a2ab030ef..860bf0a15 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -125,7 +125,9 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)): if compile_to != "vmfb": return module_str, tokenizer else: - utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name) + utils.compile_to_vmfb( + module_str, device, target_triple, max_alloc, safe_name, upload_ir + ) if __name__ == "__main__": diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index d193ded78..2b49a1792 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -155,7 +155,9 @@ def main( if compile_to != "vmfb": return module_str else: - utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name) + utils.compile_to_vmfb( + module_str, device, target_triple, max_alloc, safe_name, upload_ir + ) if __name__ == "__main__": diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index c4898dac7..14197dd3a 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -26,7 +26,9 @@ def largest_error(array1, array2): return max_error -def compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name): +def compile_to_vmfb( + module_str, device, target_triple, max_alloc, safe_name, upload_ir=False +): flags = [ "--iree-input-type=torch", "--mlir-print-debuginfo", @@ -79,7 +81,10 @@ def compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name): with open(f"{safe_name}.vmfb", "wb+") as f: f.write(flatbuffer_blob) print("Saved to", safe_name + ".vmfb") - return + if upload_ir: + return + else: + exit() def create_safe_name(hf_model_name, model_name_str): diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index 2aef05bcf..77a2ced3a 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -143,7 +143,9 @@ def main(self, 
inp=AbstractTensor(*sample, dtype=torch.float32)): if compile_to != "vmfb": return module_str else: - utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name) + utils.compile_to_vmfb( + module_str, device, target_triple, max_alloc, safe_name, upload_ir + ) if __name__ == "__main__": diff --git a/models/turbine_models/turbine_tank/turbine_tank.py b/models/turbine_models/turbine_tank/turbine_tank.py index 92a294e3e..3d06d8009 100644 --- a/models/turbine_models/turbine_tank/turbine_tank.py +++ b/models/turbine_models/turbine_tank/turbine_tank.py @@ -4,6 +4,7 @@ import datetime import os from pathlib import Path +from functools import cmp_to_key custom_path = os.getenv("TURBINE_TANK_CACHE_DIR") if custom_path is not None: @@ -108,6 +109,16 @@ def download_public_folder(model_name: str, prefix: str, model_dir: str): sample_blob.write(download_stream.readall()) +# sort blobs by last modified +def compare(item1, item2): + if item1.last_modified < item2.last_modified: + return -1 + elif item1.last_modified < item2.last_modified: + return 1 + else: + return 0 + + def downloadModelArtifacts(model_name: str) -> str: model_name = model_name.replace("/", "_") container_client = BlobServiceClient.from_connection_string( @@ -115,6 +126,7 @@ def downloadModelArtifacts(model_name: str) -> str: ).get_container_client(container=container_name) blob_list = container_client.list_blobs() # get the latest blob uploaded to turbine tank (can't use [] notation for blob_list) + blob_list = sorted(blob_list, key=cmp_to_key(compare)) for blob in blob_list: latest_blob = blob # get the prefix for the latest blob (2024-02-13_26d6428) From d584a2073ec5636518efd77fc89c0810c616e0f9 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 21 Feb 2024 02:05:45 -0800 Subject: [PATCH 04/20] update tank to add 30 models using general flow + leverage existing testing --- models/requirements.txt | 2 + models/setup.py | 1 + .../custom_models/sd_inference/clip.py | 13 - .../custom_models/sd_inference/unet.py | 12 - .../custom_models/sd_inference/utils.py | 5 +- .../custom_models/sd_inference/vae.py | 14 +- .../custom_models/stateless_llama.py | 13 - models/turbine_models/model_builder.py | 34 +- models/turbine_models/tests/sd_test.py | 8 + .../tests/stateless_llama_test.py | 3 + .../turbine_models/turbine_tank/run_models.py | 404 ------------------ .../turbine_models/turbine_tank/run_tank.py | 61 +++ .../turbine_models/turbine_tank/tank_test.py | 143 +++++++ .../turbine_models/turbine_tank/tank_util.py | 260 +++++++++++ .../turbine_tank/turbine_tank.py | 12 +- 15 files changed, 522 insertions(+), 463 deletions(-) delete mode 100644 models/turbine_models/turbine_tank/run_models.py create mode 100644 models/turbine_models/turbine_tank/run_tank.py create mode 100644 models/turbine_models/turbine_tank/tank_test.py create mode 100644 models/turbine_models/turbine_tank/tank_util.py diff --git a/models/requirements.txt b/models/requirements.txt index 99678eb68..132b52309 100644 --- a/models/requirements.txt +++ b/models/requirements.txt @@ -7,3 +7,5 @@ diffusers==0.24.0 brevitas @ git+https://github.com/Xilinx/brevitas.git@6695e8df7f6a2c7715b9ed69c4b78157376bb60b # turbine tank downloading/uploading azure-storage-blob +# microsoft/phi model +einops diff --git a/models/setup.py b/models/setup.py index 60ae10c4c..7c5dcfa97 100644 --- a/models/setup.py +++ b/models/setup.py @@ -62,5 +62,6 @@ def load_version_info(): "accelerate", "diffusers==0.24.0", "azure-storage-blob", + "einops", ], ) diff --git 
a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index 860bf0a15..d7ed96561 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -47,12 +47,6 @@ help="Specify vulkan target triple or rocm/cuda target device.", ) parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") -parser.add_argument( - "--download_ir", - action=argparse.BooleanOptionalAction, - default=True, - help="download IR from turbine tank", -) parser.add_argument( "--upload_ir", action=argparse.BooleanOptionalAction, @@ -70,7 +64,6 @@ def export_clip_model( device=None, target_triple=None, max_alloc=None, - download_ir=False, upload_ir=False, ): # Load the tokenizer and text encoder to tokenize and encode the text. @@ -80,9 +73,6 @@ def export_clip_model( token=hf_auth_token, ) - if download_ir: - return turbine_tank.downloadModelArtifacts(hf_model_name + "-clip"), tokenizer - text_encoder_model = CLIPTextModel.from_pretrained( hf_model_name, subfolder="text_encoder", @@ -132,8 +122,6 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)): if __name__ == "__main__": args = parser.parse_args() - if args.upload_ir and args.download_ir: - raise ValueError("upload_ir and download_ir can't both be true") mod_str, _ = export_clip_model( args.hf_model_name, args.hf_auth_token, @@ -143,7 +131,6 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)): args.device, args.iree_target_triple, args.vulkan_max_allocation, - args.download_ir, args.upload_ir, ) safe_name = args.hf_model_name.split("/")[-1].strip() diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index 2b49a1792..f65af556c 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -54,12 +54,6 @@ help="Specify vulkan target triple or rocm/cuda target device.", ) parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") -parser.add_argument( - "--download_ir", - action=argparse.BooleanOptionalAction, - default=True, - help="download IR from turbine tank", -) parser.add_argument( "--upload_ir", action=argparse.BooleanOptionalAction, @@ -103,11 +97,8 @@ def export_unet_model( device=None, target_triple=None, max_alloc=None, - download_ir=False, upload_ir=False, ): - if download_ir: - return turbine_tank.downloadModelArtifacts(hf_model_name + "-unet") mapper = {} utils.save_external_weights( @@ -162,8 +153,6 @@ def main( if __name__ == "__main__": args = parser.parse_args() - if args.upload_ir and args.download_ir: - raise ValueError("upload_ir and download_ir can't both be true") unet_model = UnetModel( args.hf_model_name, args.hf_auth_token, @@ -181,7 +170,6 @@ def main( args.device, args.iree_target_triple, args.vulkan_max_allocation, - args.download_ir, args.upload_ir, ) safe_name = utils.create_safe_name(args.hf_model_name, "-unet") diff --git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 14197dd3a..3d5d2a0a2 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -81,10 +81,7 @@ def compile_to_vmfb( with open(f"{safe_name}.vmfb", "wb+") as f: f.write(flatbuffer_blob) print("Saved to", safe_name + ".vmfb") - if upload_ir: - return - else: - exit() + exit() 
def create_safe_name(hf_model_name, model_name_str): diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index 77a2ced3a..5d62edf1f 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -55,12 +55,6 @@ ) parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") parser.add_argument("--variant", type=str, default="decode") -parser.add_argument( - "--download_ir", - action=argparse.BooleanOptionalAction, - default=True, - help="download IR from turbine tank", -) parser.add_argument( "--upload_ir", action=argparse.BooleanOptionalAction, @@ -102,11 +96,8 @@ def export_vae_model( target_triple=None, max_alloc=None, variant="decode", - download_ir=False, upload_ir=False, ): - if download_ir: - return turbine_tank.downloadModelArtifacts(hf_model_name + "-" + variant) mapper = {} utils.save_external_weights( @@ -135,7 +126,7 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)): with open(f"{safe_name}.mlir", "w+") as f: f.write(module_str) model_name_upload = hf_model_name.replace("/", "_") - model_name_upload = model_name_upload + "-" + variant + model_name_upload = model_name_upload + "-vae-" + variant turbine_tank.uploadToBlobStorage( str(os.path.abspath(f"{safe_name}.mlir")), f"{model_name_upload}/{model_name_upload}.mlir", @@ -150,8 +141,6 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)): if __name__ == "__main__": args = parser.parse_args() - if args.upload_ir and args.download_ir: - raise ValueError("upload_ir and download_ir can't both be true") vae_model = VaeModel( args.hf_model_name, args.hf_auth_token, @@ -170,7 +159,6 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)): args.iree_target_triple, args.vulkan_max_allocation, args.variant, - args.download_ir, args.upload_ir, ) safe_name = utils.create_safe_name(args.hf_model_name, "-vae") diff --git a/models/turbine_models/custom_models/stateless_llama.py b/models/turbine_models/custom_models/stateless_llama.py index 5e4c7ca1a..3fa19c12e 100644 --- a/models/turbine_models/custom_models/stateless_llama.py +++ b/models/turbine_models/custom_models/stateless_llama.py @@ -62,12 +62,6 @@ action="store_true", help="Compile LLM with StreamingLLM optimizations", ) -parser.add_argument( - "--download_ir", - action=argparse.BooleanOptionalAction, - default=True, - help="download IR from turbine tank", -) parser.add_argument( "--upload_ir", action=argparse.BooleanOptionalAction, @@ -120,7 +114,6 @@ def export_transformer_model( vulkan_max_allocation=None, streaming_llm=False, vmfb_path=None, - download_ir=False, upload_ir=False, ): tokenizer = AutoTokenizer.from_pretrained( @@ -129,9 +122,6 @@ def export_transformer_model( token=hf_auth_token, ) - if download_ir: - return turbine_tank.downloadModelArtifacts(hf_model_name), tokenizer - mod = AutoModelForCausalLM.from_pretrained( hf_model_name, torch_dtype=torch.float, @@ -410,8 +400,6 @@ def evict_kvcache_space(self): if __name__ == "__main__": args = parser.parse_args() - if args.upload_ir and args.download_ir: - raise ValueError("upload_ir and download_ir can't both be true") mod_str, _ = export_transformer_model( args.hf_model_name, args.hf_auth_token, @@ -425,7 +413,6 @@ def evict_kvcache_space(self): args.vulkan_max_allocation, args.streaming_llm, args.vmfb_path, - args.download_ir, args.upload_ir, ) safe_name = args.hf_model_name.split("/")[-1].strip() diff --git 
a/models/turbine_models/model_builder.py b/models/turbine_models/model_builder.py index 22139ca64..6f5c8b578 100644 --- a/models/turbine_models/model_builder.py +++ b/models/turbine_models/model_builder.py @@ -1,6 +1,9 @@ from transformers import AutoModel, AutoTokenizer, AutoConfig import torch import shark_turbine.aot as aot +from turbine_models.turbine_tank import turbine_tank +import os +import re class HFTransformerBuilder: @@ -23,6 +26,10 @@ def __init__( auto_tokenizer: AutoTokenizer = None, auto_config: AutoConfig = None, hf_auth_token=None, + upload_ir=False, + model=None, + model_type: str = None, + run_e2e: bool = None, ) -> None: self.example_input = example_input self.hf_id = hf_id @@ -30,9 +37,13 @@ def __init__( self.auto_tokenizer = auto_tokenizer self.auto_config = auto_config self.hf_auth_token = hf_auth_token - self.model = None + self.model = model self.tokenizer = None - self.build_model() + self.upload_ir = upload_ir + self.model_type = model_type + self.run_e2e = run_e2e + if self.model == None: + self.build_model() def build_model(self) -> None: """ @@ -59,6 +70,23 @@ def get_compiled_module(self, save_to: str = None) -> aot.CompiledModule: Returns: aot.CompiledModule: The compiled module binary. """ - module = aot.export(self.model, self.example_input) + if self.model_type == "hf_seq2seq": + module = aot.export(self.model, *self.example_input) + else: + module = aot.export(self.model, self.example_input) + module_str = str(module.mlir_module) + safe_name = self.hf_id.split("/")[-1].strip() + safe_name = re.sub("-", "_", safe_name) + if self.upload_ir: + with open(f"{safe_name}.mlir", "w+") as f: + f.write(module_str) + model_name_upload = self.hf_id.replace("/", "_") + turbine_tank.uploadToBlobStorage( + str(os.path.abspath(f"{safe_name}.mlir")), + f"{model_name_upload}/{model_name_upload}.mlir", + ) + os.remove(f"{safe_name}.mlir") + if self.run_e2e is not None and self.run_e2e is False: + return compiled_binary = module.compile(save_to=save_to) return compiled_binary diff --git a/models/turbine_models/tests/sd_test.py b/models/turbine_models/tests/sd_test.py index 125f97d82..a00e4bb2b 100644 --- a/models/turbine_models/tests/sd_test.py +++ b/models/turbine_models/tests/sd_test.py @@ -55,6 +55,7 @@ class StableDiffusionTest(unittest.TestCase): def testExportClipModel(self): + upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload") with self.assertRaises(SystemExit) as cm: clip.export_clip_model( # This is a public model, so no auth required @@ -64,6 +65,7 @@ def testExportClipModel(self): "safetensors", "stable_diffusion_v1_4_clip.safetensors", "cpu", + upload_ir=upload_ir_var == "upload", ) self.assertEqual(cm.exception.code, None) arguments["external_weight_path"] = "stable_diffusion_v1_4_clip.safetensors" @@ -85,6 +87,7 @@ def testExportClipModel(self): os.remove("stable_diffusion_v1_4_clip.vmfb") def testExportUnetModel(self): + upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload") with self.assertRaises(SystemExit) as cm: unet.export_unet_model( unet_model, @@ -98,6 +101,7 @@ def testExportUnetModel(self): "safetensors", "stable_diffusion_v1_4_unet.safetensors", "cpu", + upload_ir=upload_ir_var == "upload", ) self.assertEqual(cm.exception.code, None) arguments["external_weight_path"] = "stable_diffusion_v1_4_unet.safetensors" @@ -135,6 +139,7 @@ def testExportUnetModel(self): os.remove("stable_diffusion_v1_4_unet.vmfb") def testExportVaeModelDecode(self): + upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload") with 
self.assertRaises(SystemExit) as cm: vae.export_vae_model( vae_model, @@ -149,6 +154,7 @@ def testExportVaeModelDecode(self): "stable_diffusion_v1_4_vae.safetensors", "cpu", variant="decode", + upload_ir=upload_ir_var == "upload", ) self.assertEqual(cm.exception.code, None) arguments["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors" @@ -180,6 +186,7 @@ def testExportVaeModelDecode(self): os.remove("stable_diffusion_v1_4_vae.vmfb") def testExportVaeModelEncode(self): + upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload") with self.assertRaises(SystemExit) as cm: vae.export_vae_model( vae_model, @@ -194,6 +201,7 @@ def testExportVaeModelEncode(self): "stable_diffusion_v1_4_vae.safetensors", "cpu", variant="encode", + upload_ir=upload_ir_var == "upload", ) self.assertEqual(cm.exception.code, None) arguments["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors" diff --git a/models/turbine_models/tests/stateless_llama_test.py b/models/turbine_models/tests/stateless_llama_test.py index 574902101..c72c55e55 100644 --- a/models/turbine_models/tests/stateless_llama_test.py +++ b/models/turbine_models/tests/stateless_llama_test.py @@ -53,6 +53,8 @@ def test_vmfb_comparison(self): For VMFB, quantization can be int4 or None, but right now only using none for compatibility with torch. """ + upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload") + llama.export_transformer_model( hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2", hf_auth_token=None, @@ -63,6 +65,7 @@ def test_vmfb_comparison(self): precision=precision, device="llvm-cpu", target_triple="host", + upload_ir=upload_ir_var == "upload", ) torch_str_cache_path = f"models/turbine_models/tests/vmfb_comparison_cached_torch_output_{precision}_{quantization}.txt" diff --git a/models/turbine_models/turbine_tank/run_models.py b/models/turbine_models/turbine_tank/run_models.py deleted file mode 100644 index 5d612c4ee..000000000 --- a/models/turbine_models/turbine_tank/run_models.py +++ /dev/null @@ -1,404 +0,0 @@ -# Copyright 2023 Nod Labs, Inc -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -import argparse -from turbine_models.custom_models.sd_inference import ( - clip, - clip_runner, - unet, - unet_runner, - vae, - vae_runner, -) - -from turbine_models.custom_models.sd_inference import utils -import torch -import os -import turbine_models.custom_models.stateless_llama as llama -import difflib -from turbine_models.turbine_tank import turbine_tank - -parser = argparse.ArgumentParser() -parser.add_argument( - "--download_ir", - action=argparse.BooleanOptionalAction, - default=False, - help="download IR from turbine tank", -) -parser.add_argument( - "--upload_ir", - action=argparse.BooleanOptionalAction, - default=True, - help="upload IR to turbine tank", -) - -os.environ["TORCH_LOGS"] = "dynamic" -from shark_turbine.aot import * -from turbine_models.custom_models import llm_runner - -from turbine_models.gen_external_params.gen_external_params import ( - gen_external_params, -) - -DEFAULT_PROMPT = """[INST] <> -Be concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <> hi what are you? 
[/INST] -""" - - -def check_output_string(reference, output): - # Calculate and print diff - diff = difflib.unified_diff( - reference.splitlines(keepends=True), - output.splitlines(keepends=True), - fromfile="reference", - tofile="output", - lineterm="", - ) - return "".join(diff) - - -def run_llama_model(download_ir=False, upload_ir=True): - if not download_ir: - gen_external_params( - hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2", - hf_auth_token=None, - ) - llama.export_transformer_model( - hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2", - hf_auth_token=None, - compile_to="vmfb", - external_weights="safetensors", - # external_weight_file="Llama-2-7b-chat-hf-function-calling-v2_f16_int4.safetensors", Do not export weights because this doesn't get quantized - quantization="int4", - precision="f16", - device="llvm-cpu", - target_triple="host", - download_ir=download_ir, - upload_ir=upload_ir, - ) - - if download_ir: - return - - torch_str_cache_path = ( - f"models/turbine_models/tests/vmfb_comparison_cached_torch_output_f16_int4.txt" - ) - # if cached, just read - if os.path.exists(torch_str_cache_path): - with open(torch_str_cache_path, "r") as f: - torch_str = f.read() - else: - torch_str = llm_runner.run_torch_llm( - "Trelis/Llama-2-7b-chat-hf-function-calling-v2", None, DEFAULT_PROMPT - ) - - with open(torch_str_cache_path, "w") as f: - f.write(torch_str) - - turbine_str = llm_runner.run_llm( - "local-task", - DEFAULT_PROMPT, - "Llama_2_7b_chat_hf_function_calling_v2.vmfb", - "Trelis/Llama-2-7b-chat-hf-function-calling-v2", - None, - f"Llama_2_7b_chat_hf_function_calling_v2_f16_int4.safetensors", - ) - - result = check_output_string(torch_str, turbine_str) - - # clean up - os.remove("Llama_2_7b_chat_hf_function_calling_v2_f16_int4.safetensors") - os.remove("Llama_2_7b_chat_hf_function_calling_v2.vmfb") - os.remove("Llama_2_7b_chat_hf_function_calling_v2.mlir") - - return result - - -arguments = { - "hf_auth_token": None, - "hf_model_name": "CompVis/stable-diffusion-v1-4", - "batch_size": 1, - "height": 512, - "width": 512, - "run_vmfb": True, - "compile_to": None, - "external_weight_path": "", - "vmfb_path": "", - "external_weights": None, - "device": "local-task", - "iree_target_triple": "", - "vulkan_max_allocation": "4294967296", - "prompt": "a photograph of an astronaut riding a horse", - "in_channels": 4, -} - - -unet_model = unet.UnetModel( - # This is a public model, so no auth required - "CompVis/stable-diffusion-v1-4", - None, -) - -vae_model = vae.VaeModel( - # This is a public model, so no auth required - "CompVis/stable-diffusion-v1-4", - None, -) - - -def run_clip_model(download_ir=False, upload_ir=True): - clip.export_clip_model( - # This is a public model, so no auth required - "CompVis/stable-diffusion-v1-4", - None, - "vmfb", - "safetensors", - "stable_diffusion_v1_4_clip.safetensors", - "cpu", - download_ir=download_ir, - upload_ir=upload_ir, - ) - - if download_ir: - return - - arguments["external_weight_path"] = "stable_diffusion_v1_4_clip.safetensors" - arguments["vmfb_path"] = "stable_diffusion_v1_4_clip.vmfb" - turbine = clip_runner.run_clip( - arguments["device"], - arguments["prompt"], - arguments["vmfb_path"], - arguments["hf_model_name"], - arguments["hf_auth_token"], - arguments["external_weight_path"], - ) - torch_output = clip_runner.run_torch_clip( - arguments["hf_model_name"], arguments["hf_auth_token"], arguments["prompt"] - ) - err = utils.largest_error(torch_output, turbine[0]) - if err < 9e-5: - result = "CLIP SUCCESS: 
" + str(err) - else: - result = "CLIP FAILURE: " + str(err) - - # clean up - os.remove("stable_diffusion_v1_4_clip.safetensors") - os.remove("stable_diffusion_v1_4_clip.vmfb") - os.remove("stable_diffusion_v1_4_clip.mlir") - - return result - - -def run_unet_model(download_ir=False, upload_ir=True): - unet.export_unet_model( - unet_model, - # This is a public model, so no auth required - "CompVis/stable-diffusion-v1-4", - arguments["batch_size"], - arguments["height"], - arguments["width"], - None, - "vmfb", - "safetensors", - "stable_diffusion_v1_4_unet.safetensors", - "cpu", - download_ir=download_ir, - upload_ir=upload_ir, - ) - - if download_ir: - return - - arguments["external_weight_path"] = "stable_diffusion_v1_4_unet.safetensors" - arguments["vmfb_path"] = "stable_diffusion_v1_4_unet.vmfb" - sample = torch.rand( - arguments["batch_size"], - arguments["in_channels"], - arguments["height"] // 8, - arguments["width"] // 8, - dtype=torch.float32, - ) - timestep = torch.zeros(1, dtype=torch.float32) - encoder_hidden_states = torch.rand(2, 77, 768, dtype=torch.float32) - - turbine = unet_runner.run_unet( - arguments["device"], - sample, - timestep, - encoder_hidden_states, - arguments["vmfb_path"], - arguments["hf_model_name"], - arguments["hf_auth_token"], - arguments["external_weight_path"], - ) - torch_output = unet_runner.run_torch_unet( - arguments["hf_model_name"], - arguments["hf_auth_token"], - sample, - timestep, - encoder_hidden_states, - ) - err = utils.largest_error(torch_output, turbine) - if err < 9e-5: - result = "UNET SUCCESS: " + str(err) - else: - result = "UNET FAILURE: " + str(err) - - # clean up - os.remove("stable_diffusion_v1_4_unet.safetensors") - os.remove("stable_diffusion_v1_4_unet.vmfb") - os.remove("stable_diffusion_v1_4_unet.mlir") - - return result - - -def run_vae_decode(download_ir=False, upload_ir=True): - vae.export_vae_model( - vae_model, - # This is a public model, so no auth required - "CompVis/stable-diffusion-v1-4", - arguments["batch_size"], - arguments["height"], - arguments["width"], - None, - "vmfb", - "safetensors", - "stable_diffusion_v1_4_vae.safetensors", - "cpu", - variant="decode", - download_ir=download_ir, - upload_ir=upload_ir, - ) - - if download_ir: - return - - arguments["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors" - arguments["vmfb_path"] = "stable_diffusion_v1_4_vae.vmfb" - example_input = torch.rand( - arguments["batch_size"], - 4, - arguments["height"] // 8, - arguments["width"] // 8, - dtype=torch.float32, - ) - turbine = vae_runner.run_vae( - arguments["device"], - example_input, - arguments["vmfb_path"], - arguments["hf_model_name"], - arguments["hf_auth_token"], - arguments["external_weight_path"], - ) - torch_output = vae_runner.run_torch_vae( - arguments["hf_model_name"], - arguments["hf_auth_token"], - "decode", - example_input, - ) - err = utils.largest_error(torch_output, turbine) - if err < 9e-5: - result = "VAE DECODE SUCCESS: " + str(err) - else: - result = "VAE DECODE FAILURE: " + str(err) - - # clean up - os.remove("stable_diffusion_v1_4_vae.safetensors") - os.remove("stable_diffusion_v1_4_vae.vmfb") - os.remove("stable_diffusion_v1_4_vae.mlir") - - return result - - -def run_vae_encode(download_ir=False, upload_ir=True): - vae.export_vae_model( - vae_model, - # This is a public model, so no auth required - "CompVis/stable-diffusion-v1-4", - arguments["batch_size"], - arguments["height"], - arguments["width"], - None, - "vmfb", - "safetensors", - "stable_diffusion_v1_4_vae.safetensors", - 
"cpu", - variant="encode", - download_ir=download_ir, - upload_ir=upload_ir, - ) - - if download_ir: - return - - arguments["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors" - arguments["vmfb_path"] = "stable_diffusion_v1_4_vae.vmfb" - example_input = torch.rand( - arguments["batch_size"], - 3, - arguments["height"], - arguments["width"], - dtype=torch.float32, - ) - turbine = vae_runner.run_vae( - arguments["device"], - example_input, - arguments["vmfb_path"], - arguments["hf_model_name"], - arguments["hf_auth_token"], - arguments["external_weight_path"], - ) - torch_output = vae_runner.run_torch_vae( - arguments["hf_model_name"], - arguments["hf_auth_token"], - "encode", - example_input, - ) - err = utils.largest_error(torch_output, turbine) - if err < 2e-3: - result = "VAE ENCODE SUCCESS: " + str(err) - else: - result = "VAE ENCODE FAILURE: " + str(err) - - # clean up - os.remove("stable_diffusion_v1_4_vae.safetensors") - os.remove("stable_diffusion_v1_4_vae.vmfb") - os.remove("stable_diffusion_v1_4_vae.mlir") - - return result - - -if __name__ == "__main__": - args = parser.parse_args() - - if args.upload_ir and args.download_ir: - raise ValueError("upload_ir and download_ir can't both be true") - - if args.upload_ir: - result = "Turbine Tank Results\n" - llama_result = run_llama_model(args.download_ir, args.upload_ir) - result += llama_result + "\n" - clip_result = run_clip_model(args.download_ir, args.upload_ir) - result += clip_result + "\n" - unet_result = run_unet_model(args.download_ir, args.upload_ir) - result += unet_result + "\n" - vae_decode_result = run_vae_decode(args.download_ir, args.upload_ir) - result += vae_decode_result + "\n" - vae_encode_result = run_vae_encode(args.download_ir, args.upload_ir) - result += vae_encode_result + "\n" - f = open("daily_report.txt", "a") - f.write(result) - f.close() - turbine_tank.uploadToBlobStorage( - str(os.path.abspath("daily_report.txt")), "daily_report.txt" - ) - os.remove("daily_report.txt") - else: - run_llama_model(args.download_ir, args.upload_ir) - run_clip_model(args.download_ir, args.upload_ir) - run_unet_model(args.download_ir, args.upload_ir) - run_vae_decode(args.download_ir, args.upload_ir) - run_vae_encode(args.download_ir, args.upload_ir) diff --git a/models/turbine_models/turbine_tank/run_tank.py b/models/turbine_models/turbine_tank/run_tank.py new file mode 100644 index 000000000..816d92322 --- /dev/null +++ b/models/turbine_models/turbine_tank/run_tank.py @@ -0,0 +1,61 @@ +# Copyright 2023 Nod Labs, Inc +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import argparse +import unittest +from turbine_models.turbine_tank import tank_util + +import turbine_models.tests.sd_test as sd_test +import os +from turbine_models.turbine_tank import turbine_tank + +import pytest + +parser = argparse.ArgumentParser() +parser.add_argument( + "--download_ir", + action=argparse.BooleanOptionalAction, + default=False, + help="download IR from turbine tank", +) + +if __name__ == "__main__": + args = parser.parse_args() + + if args.download_ir: + turbine_tank.downloadModelArtifacts("CompVis/stable-diffusion-v1-4-clip") + turbine_tank.downloadModelArtifacts("CompVis/stable-diffusion-v1-4-vae-decode") + turbine_tank.downloadModelArtifacts("CompVis/stable-diffusion-v1-4-vae-encode") + turbine_tank.downloadModelArtifacts("CompVis/stable-diffusion-v1-4-unet") + turbine_tank.downloadModelArtifacts( + "Trelis/Llama-2-7b-chat-hf-function-calling-v2" + ) + for model_name, _ in tank_util.model_list: + turbine_tank.downloadModelArtifacts(model_name) + else: + import turbine_models.tests.stateless_llama_test as stateless_llama_test + + # environment variable used to let the llama/sd tests know we are running from tank and want to upload + os.environ["TURBINE_TANK_ACTION"] = "upload" + + # run existing turbine llama and sd tests integrated with turbine tank + llama_suite = unittest.TestLoader().loadTestsFromModule(stateless_llama_test) + unittest.TextTestRunner(verbosity=2).run(llama_suite) + + sd_suite = unittest.TestLoader().loadTestsFromModule(sd_test) + unittest.TextTestRunner(verbosity=2).run(sd_suite) + + # cleanup + os.remove("Llama_2_7b_chat_hf_function_calling_v2_f32_unquantized.safetensors") + os.remove("Llama_2_7b_chat_hf_function_calling_v2.mlir") + os.remove("Llama_2_7b_chat_hf_function_calling_v2.vmfb") + os.remove("streaming_llama.vmfb") + os.remove("stable_diffusion_v1_4_clip.mlir") + os.remove("stable_diffusion_v1_4_unet.mlir") + os.remove("stable_diffusion_v1_4_vae.mlir") + + # runs tank_test.py (only pytest file in this directory, runs 30 models e2e) + pytest.main(["-v", os.path.dirname(os.path.abspath(__file__))]) diff --git a/models/turbine_models/turbine_tank/tank_test.py b/models/turbine_models/turbine_tank/tank_test.py new file mode 100644 index 000000000..17c8ec8ba --- /dev/null +++ b/models/turbine_models/turbine_tank/tank_test.py @@ -0,0 +1,143 @@ +import pytest +from turbine_models.turbine_tank import tank_util +from turbine_models.model_builder import HFTransformerBuilder +from turbine_models.model_runner import vmfbRunner +from turbine_models.custom_models.sd_inference import utils +from iree import runtime as ireert +import os + + +@pytest.mark.parametrize( + "model_name,model_type,expected_err,run_e2e", + [ + ("microsoft/resnet-50", "hf_img_cls", 8e-05, True), + ("bert-large-uncased", "hf", 8e-06, True), + ("facebook/deit-small-distilled-patch16-224", "hf_img_cls", 8e-05, True), + ("google/vit-base-patch16-224", "hf_img_cls", 8e-05, True), + ("microsoft/beit-base-patch16-224-pt22k-ft22k", "hf_img_cls", 8e-05, True), + ("microsoft/MiniLM-L12-H384-uncased", "hf", 5e-07, True), + ("google/mobilebert-uncased", "hf", 4.3, True), + ("mobilenet_v3_small", "vision", 6e-05, True), + ("nvidia/mit-b0", "hf_img_cls", 7.3, True), + ("resnet101", "vision", 8e-06, True), + ("resnet18", "vision", 8e-06, True), + ("resnet50", "vision", 8e-06, True), + ("squeezenet1_0", "vision", 9e-06, True), + ("wide_resnet50_2", "vision", 9e-06, True), + ("mnasnet1_0", "vision", 2e-05, True), + pytest.param( + 
"t5-base", + "hf_seq2seq", + -1, + False, + marks=pytest.mark.xfail(reason="iree-compile fails"), + ), + pytest.param( + "t5-large", + "hf_seq2seq", + -1, + False, + marks=pytest.mark.xfail(reason="iree-compile fails"), + ), + ("openai/whisper-base", "hf_causallm", 9e-05, True), + ("openai/whisper-small", "hf_causallm", 0.0003, True), + ("openai/whisper-medium", "hf_causallm", 0.0003, True), + ("facebook/opt-350m", "hf", 9e-07, True), + ("facebook/opt-1.3b", "hf", 9e-06, True), + ("BAAI/bge-base-en-v1.5", "hf", 9e-07, True), + pytest.param( + "facebook/bart-large", + "hf_seq2seq", + -1, + False, + marks=pytest.mark.xfail(reason="iree-compile fails"), + ), + pytest.param( + "gpt2", + "hf", + -1, + False, + marks=pytest.mark.xfail(reason="iree-compile fails"), + ), + pytest.param( + "gpt2-xl", + "hf", + -1, + False, + marks=pytest.mark.xfail(reason="iree-compile fails"), + ), + ("lmsys/vicuna-13b-v1.3", "hf", 5e-05, True), + pytest.param( + "microsoft/phi-1_5", + "hf_causallm", + -1, + True, + marks=pytest.mark.xfail(reason="correctness issue"), + ), # nan error reported (correctness issue) + pytest.param( + "microsoft/phi-2", + "hf_causallm", + -1, + True, + marks=pytest.mark.xfail(reason="correctness issue"), + ), # nan error reported (correctness issue) + pytest.param( + "mosaicml/mpt-30b", + "hf_causallm", + -1, + False, + marks=pytest.mark.xfail(reason="iree-compile fails"), + ), + ("stabilityai/stablelm-3b-4e1t", "hf_causallm", 0.0004, True), + ], +) +def test_all_models(model_name, model_type, expected_err, run_e2e): + import_args = { + "batch_size": 1, + } + # Based on the model type, get the appropriate hugging face model, inputs, and output + if model_type == "vision": + torch_model, input, out = tank_util.get_vision_model(model_name, import_args) + elif model_type == "hf": + torch_model, input, out = tank_util.get_hf_model(model_name, import_args) + elif model_type == "hf_seq2seq": + torch_model, input, out = tank_util.get_hf_seq2seq_model( + model_name, import_args + ) + elif model_type == "hf_causallm": + torch_model, input, out = tank_util.get_hf_causallm_model( + model_name, import_args + ) + elif model_type == "hf_img_cls": + torch_model, input, out = tank_util.get_hf_img_cls_model( + model_name, import_args + ) + # compile model and get vmfb + model = HFTransformerBuilder( + example_input=input, + hf_id=model_name, + hf_auth_token="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", + upload_ir=True, + model=torch_model, + model_type=model_type, + run_e2e=run_e2e, + ) + vmfb_name = model_name.replace("/", "_") + ".vmfb" + model.get_compiled_module(save_to=vmfb_name) + + # if model is not supposed to run e2e, exit at this point (mlir has been uploaded) + if run_e2e is False: + assert expected_err > 0 + return + + # run inference using iree runtime + runner = vmfbRunner("local-task", vmfb_name) + inputs = [ireert.asdevicearray(runner.config.device, input)] + keys = list(runner.ctx.modules) + key = keys[len(keys) - 1] + results = runner.ctx.modules.__getattr__(key)["main"](*inputs) + err = utils.largest_error(out.cpu().detach().numpy(), results) + # cleanup + os.remove(vmfb_name) + # accuracy + assert err < expected_err diff --git a/models/turbine_models/turbine_tank/tank_util.py b/models/turbine_models/turbine_tank/tank_util.py new file mode 100644 index 000000000..4832e9e24 --- /dev/null +++ b/models/turbine_models/turbine_tank/tank_util.py @@ -0,0 +1,260 @@ +import torch +import numpy as np + +torch.manual_seed(0) + +BATCH_SIZE = 1 + +model_list = [ + ("microsoft/resnet-50", 
"hf_img_cls"), + ("bert-large-uncased", "hf"), + ("facebook/deit-small-distilled-patch16-224", "hf_img_cls"), + ("google/vit-base-patch16-224", "hf_img_cls"), + ("microsoft/beit-base-patch16-224-pt22k-ft22k", "hf_img_cls"), + ("microsoft/MiniLM-L12-H384-uncased", "hf"), + ("google/mobilebert-uncased", "hf"), + ("mobilenet_v3_small", "vision"), + ("nvidia/mit-b0", "hf_img_cls"), + ("resnet101", "vision"), + ("resnet18", "vision"), + ("resnet50", "vision"), + ("squeezenet1_0", "vision"), + ("wide_resnet50_2", "vision"), + ("mnasnet1_0", "vision"), + ("t5-base", "hf_seq2seq"), # iree-compile failure + ("t5-large", "hf_seq2seq"), # iree-compile failure + ("openai/whisper-base", "hf_causallm"), + ("openai/whisper-small", "hf_causallm"), + ("openai/whisper-medium", "hf_causallm"), + ("facebook/opt-350m", "hf"), + ("facebook/opt-1.3b", "hf"), + ("BAAI/bge-base-en-v1.5", "hf"), + ("facebook/bart-large", "hf_seq2seq"), # iree-compile fails + ("gpt2", "hf"), # iree-compile fails + ("gpt2-xl", "hf"), # iree-compile fails + ("lmsys/vicuna-13b-v1.3", "hf"), + ("microsoft/phi-1_5", "hf_causallm"), # nan error reported (correctness issue) + ("microsoft/phi-2", "hf_causallm"), # nan error reported (correctness issue) + ("mosaicml/mpt-30b", "hf_causallm"), # iree-compile fails + ("stabilityai/stablelm-3b-4e1t", "hf_causallm"), +] + + +##################### Hugging Face Image Classification Models ################################### +from transformers import AutoModelForImageClassification +from transformers import AutoFeatureExtractor +from PIL import Image +import requests + + +def preprocess_input_image(model_name): + # from datasets import load_dataset + # dataset = load_dataset("huggingface/cats-image") + # image1 = dataset["test"]["image"][0] + # # print("image1: ", image1) # + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + # + image = Image.open(requests.get(url, stream=True).raw) + # feature_extractor = img_models_fe_dict[model_name].from_pretrained( + # model_name + # ) + feature_extractor = AutoFeatureExtractor.from_pretrained(model_name) + inputs = feature_extractor(images=image, return_tensors="pt") + # inputs = {'pixel_values': tensor([[[[ 0.1137..., -0.2000, -0.4275, -0.5294]]]])} + # torch.Size([1, 3, 224, 224]), torch.FloatTensor + + return inputs[str(*inputs)] + + +class HuggingFaceImageClassification(torch.nn.Module): + def __init__(self, hf_model_name): + super().__init__() + self.model = AutoModelForImageClassification.from_pretrained( + hf_model_name, # The pretrained model. + output_attentions=False, # Whether the model returns attentions weights. + return_dict=False, # https://github.com/huggingface/transformers/issues/9095 + torchscript=True, + ) + + def forward(self, inputs): + return self.model.forward(inputs)[0] + + +def get_hf_img_cls_model(name, import_args): + model = HuggingFaceImageClassification(name) + # you can use preprocess_input_image to get the test_input or just random value. 
+ test_input = preprocess_input_image(name) + # test_input = torch.FloatTensor(1, 3, 224, 224).uniform_(-1, 1) + # print("test_input.shape: ", test_input.shape) + # test_input.shape: torch.Size([1, 3, 224, 224]) + test_input = test_input.repeat(int(import_args["batch_size"]), 1, 1, 1) + actual_out = model(test_input) + # actual_out.shape: torch.Size([1, 1000]) + return model, test_input, actual_out + + +##################### Hugging Face LM Models ################################### + + +class HuggingFaceLanguage(torch.nn.Module): + def __init__(self, hf_model_name): + super().__init__() + from transformers import AutoModelForSequenceClassification, AutoTokenizer + import transformers as trf + + transformers_path = trf.__path__[0] + hf_model_path = f"{transformers_path}/models/{hf_model_name}" + self.model = AutoModelForSequenceClassification.from_pretrained( + hf_model_name, # The pretrained model. + num_labels=2, # The number of output labels--2 for binary classification. + output_attentions=False, # Whether the model returns attentions weights. + output_hidden_states=False, # Whether the model returns all hidden-states. + torchscript=True, + ) + self.model.config.pad_token_id = None + + def forward(self, tokens): + return self.model.forward(tokens)[0] + + +def get_hf_model(name, import_args): + from transformers import ( + BertTokenizer, + ) + + model = HuggingFaceLanguage(name) + test_input = torch.randint(2, (int(import_args["batch_size"]), 128)) + actual_out = model(test_input) + return model, test_input, actual_out + + +##################### Hugging Face Seq2SeqLM Models ################################### + +# We use a maximum sequence length of 512 since this is the default used in the T5 config. +T5_MAX_SEQUENCE_LENGTH = 512 + + +class HFSeq2SeqLanguageModel(torch.nn.Module): + def __init__(self, model_name): + super().__init__() + from transformers import AutoTokenizer, T5Model + + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.tokenization_kwargs = { + "pad_to_multiple_of": T5_MAX_SEQUENCE_LENGTH, + "padding": True, + "return_tensors": "pt", + } + self.model = T5Model.from_pretrained(model_name, return_dict=True) + + def preprocess_input(self, text): + return self.tokenizer(text, **self.tokenization_kwargs) + + def forward(self, input_ids, decoder_input_ids): + return self.model.forward(input_ids, decoder_input_ids=decoder_input_ids)[0] + + +def get_hf_seq2seq_model(name, import_args): + m = HFSeq2SeqLanguageModel(name) + encoded_input_ids = m.preprocess_input( + "Studies have been shown that owning a dog is good for you" + ).input_ids + decoder_input_ids = m.preprocess_input("Studies show that").input_ids + decoder_input_ids = m.model._shift_right(decoder_input_ids) + + test_input = (encoded_input_ids, decoder_input_ids) + actual_out = m.forward(*test_input) + return m, test_input, actual_out + + +##################### Hugging Face CausalLM Models ################################### +from transformers import AutoTokenizer, AutoModelForCausalLM + + +def prepare_sentence_tokens(hf_model: str, sentence: str): + tokenizer = AutoTokenizer.from_pretrained(hf_model) + return torch.tensor([tokenizer.encode(sentence)]) + + +class HFCausalLM(torch.nn.Module): + def __init__(self, model_name: str): + super().__init__() + self.model = AutoModelForCausalLM.from_pretrained( + model_name, # The pretrained model name. + # The number of output labels--2 for binary classification. + num_labels=2, + # Whether the model returns attentions weights. 
+            output_attentions=False,
+            # Whether the model returns all hidden-states.
+            output_hidden_states=False,
+            torchscript=True,
+            trust_remote_code=True,
+        )
+        self.model.eval()
+
+    def forward(self, tokens):
+        return self.model.forward(tokens)[0]
+
+
+def get_hf_causallm_model(name, import_args):
+    m = HFCausalLM(name)
+    test_input = prepare_sentence_tokens(name, "this project is very interesting")
+    actual_out = m.forward(test_input)
+    return m, test_input, actual_out
+
+
+################################################################################
+
+##################### Torch Vision Models ###################################
+
+
+class VisionModule(torch.nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+        self.train(False)
+
+    def forward(self, input):
+        return self.model.forward(input)
+
+
+def get_vision_model(torch_model, import_args):
+    import torchvision.models as models
+
+    default_image_size = (224, 224)
+    modelname = torch_model
+    if modelname == "alexnet":
+        torch_model = models.alexnet(weights="DEFAULT")
+        input_image_size = default_image_size
+    if modelname == "resnet18":
+        torch_model = models.resnet18(weights="DEFAULT")
+        input_image_size = default_image_size
+    if modelname == "resnet50":
+        torch_model = models.resnet50(weights="DEFAULT")
+        input_image_size = default_image_size
+    if modelname == "resnet50_fp16":
+        torch_model = models.resnet50(weights="DEFAULT")
+        input_image_size = default_image_size
+    if modelname == "resnet101":
+        torch_model = models.resnet101(weights="DEFAULT")
+        input_image_size = default_image_size
+    if modelname == "squeezenet1_0":
+        torch_model = models.squeezenet1_0(weights="DEFAULT")
+        input_image_size = default_image_size
+    if modelname == "wide_resnet50_2":
+        torch_model = models.wide_resnet50_2(weights="DEFAULT")
+        input_image_size = default_image_size
+    if modelname == "mobilenet_v3_small":
+        torch_model = models.mobilenet_v3_small(weights="DEFAULT")
+        input_image_size = default_image_size
+    if modelname == "mnasnet1_0":
+        torch_model = models.mnasnet1_0(weights="DEFAULT")
+        input_image_size = default_image_size
+
+    model = VisionModule(torch_model)
+    test_input = torch.randn(int(import_args["batch_size"]), 3, *input_image_size)
+    actual_out = model(test_input)
+    return model, test_input, actual_out
diff --git a/models/turbine_models/turbine_tank/turbine_tank.py b/models/turbine_models/turbine_tank/turbine_tank.py
index 3d06d8009..708218c75 100644
--- a/models/turbine_models/turbine_tank/turbine_tank.py
+++ b/models/turbine_models/turbine_tank/turbine_tank.py
@@ -90,10 +90,12 @@ def download_public_folder(model_name: str, prefix: str, model_dir: str):
         container=container_name
     )
     blob_list = container_client.list_blobs(name_starts_with=prefix)
+    empty = True

     # go through the blobs with our target prefix
     # example prefix: "2024-02-13_26d6428/CompVis_stable-diffusion-v1-4-clip"
     for blob in blob_list:
+        empty = False
         blob_client = blob_service_client.get_blob_client(
             container=container_name, blob=blob.name
         )
@@ -108,6 +110,12 @@ def download_public_folder(model_name: str, prefix: str, model_dir: str):
             download_stream = blob_client.download_blob()
             sample_blob.write(download_stream.readall())

+    if empty:
+        print(f"Model ({model_name}) has not been uploaded yet")
+        return True
+
+    return False
+

 # sort blobs by last modified
 def compare(item1, item2):
@@ -140,11 +148,13 @@ def 
downloadModelArtifacts(model_name: str) -> str: print("Already downloaded most recent version") return "NA" # download the model artifacts (passing in the model name, path in azure storage to model artifacts, local directory to store) - download_public_folder( + blobDNE = download_public_folder( model_name, download_latest_prefix + "/" + model_name, os.path.join(model_dir, download_latest_prefix), ) + if blobDNE: + return model_dir = os.path.join(WORKDIR, model_name + "/" + download_latest_prefix) mlir_filename = os.path.join(model_dir, model_name + ".mlir") print( From b1ad5726f8df9094dc487309b1d8186f2872fcd0 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 21 Feb 2024 02:10:49 -0800 Subject: [PATCH 05/20] formatting --- models/turbine_models/custom_models/sd_inference/unet.py | 1 - models/turbine_models/custom_models/sd_inference/vae.py | 1 - 2 files changed, 2 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index f65af556c..829a8c0bc 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -99,7 +99,6 @@ def export_unet_model( max_alloc=None, upload_ir=False, ): - mapper = {} utils.save_external_weights( mapper, unet_model, external_weights, external_weight_path diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index 5d62edf1f..885ac6e60 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -98,7 +98,6 @@ def export_vae_model( variant="decode", upload_ir=False, ): - mapper = {} utils.save_external_weights( mapper, vae_model, external_weights, external_weight_path From a465a483e2b0888c7065498d7800abe25ebd021f Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 21 Feb 2024 02:56:02 -0800 Subject: [PATCH 06/20] remove unnecessary upload_ir var pass in utils --- models/turbine_models/custom_models/sd_inference/clip.py | 4 +--- models/turbine_models/custom_models/sd_inference/unet.py | 4 +--- models/turbine_models/custom_models/sd_inference/utils.py | 4 +--- models/turbine_models/custom_models/sd_inference/vae.py | 4 +--- 4 files changed, 4 insertions(+), 12 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index d7ed96561..2c09b13fb 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -115,9 +115,7 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)): if compile_to != "vmfb": return module_str, tokenizer else: - utils.compile_to_vmfb( - module_str, device, target_triple, max_alloc, safe_name, upload_ir - ) + utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name) if __name__ == "__main__": diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index 829a8c0bc..3a42dd918 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -145,9 +145,7 @@ def main( if compile_to != "vmfb": return module_str else: - utils.compile_to_vmfb( - module_str, device, target_triple, max_alloc, safe_name, upload_ir - ) + utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name) if __name__ == "__main__": diff 
--git a/models/turbine_models/custom_models/sd_inference/utils.py b/models/turbine_models/custom_models/sd_inference/utils.py index 3d5d2a0a2..37787fd3a 100644 --- a/models/turbine_models/custom_models/sd_inference/utils.py +++ b/models/turbine_models/custom_models/sd_inference/utils.py @@ -26,9 +26,7 @@ def largest_error(array1, array2): return max_error -def compile_to_vmfb( - module_str, device, target_triple, max_alloc, safe_name, upload_ir=False -): +def compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name): flags = [ "--iree-input-type=torch", "--mlir-print-debuginfo", diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index 885ac6e60..8ba0fb6bb 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -133,9 +133,7 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)): if compile_to != "vmfb": return module_str else: - utils.compile_to_vmfb( - module_str, device, target_triple, max_alloc, safe_name, upload_ir - ) + utils.compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name) if __name__ == "__main__": From ac8997658becb8ade883938d73ce3ef34600b108 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 21 Feb 2024 12:31:04 -0800 Subject: [PATCH 07/20] address comments --- models/turbine_models/custom_models/sd_inference/clip.py | 7 ------- models/turbine_models/custom_models/sd_inference/unet.py | 7 ------- models/turbine_models/custom_models/sd_inference/vae.py | 7 ------- models/turbine_models/custom_models/stateless_llama.py | 7 ------- models/turbine_models/turbine_tank/turbine_tank.py | 9 ++++----- 5 files changed, 4 insertions(+), 33 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index 2c09b13fb..4cc5f91dd 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -47,12 +47,6 @@ help="Specify vulkan target triple or rocm/cuda target device.", ) parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") -parser.add_argument( - "--upload_ir", - action=argparse.BooleanOptionalAction, - default=False, - help="upload IR to turbine tank", -) def export_clip_model( @@ -129,7 +123,6 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)): args.device, args.iree_target_triple, args.vulkan_max_allocation, - args.upload_ir, ) safe_name = args.hf_model_name.split("/")[-1].strip() safe_name = re.sub("-", "_", safe_name) diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index 3a42dd918..2c1556e84 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -54,12 +54,6 @@ help="Specify vulkan target triple or rocm/cuda target device.", ) parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") -parser.add_argument( - "--upload_ir", - action=argparse.BooleanOptionalAction, - default=False, - help="upload IR to turbine tank", -) class UnetModel(torch.nn.Module): @@ -167,7 +161,6 @@ def main( args.device, args.iree_target_triple, args.vulkan_max_allocation, - args.upload_ir, ) safe_name = utils.create_safe_name(args.hf_model_name, "-unet") with open(f"{safe_name}.mlir", "w+") as f: diff --git 
a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index 8ba0fb6bb..fcf9453b4 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -55,12 +55,6 @@ ) parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296") parser.add_argument("--variant", type=str, default="decode") -parser.add_argument( - "--upload_ir", - action=argparse.BooleanOptionalAction, - default=False, - help="upload IR to turbine tank", -) class VaeModel(torch.nn.Module): @@ -156,7 +150,6 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)): args.iree_target_triple, args.vulkan_max_allocation, args.variant, - args.upload_ir, ) safe_name = utils.create_safe_name(args.hf_model_name, "-vae") with open(f"{safe_name}.mlir", "w+") as f: diff --git a/models/turbine_models/custom_models/stateless_llama.py b/models/turbine_models/custom_models/stateless_llama.py index 3fa19c12e..6863fd5c2 100644 --- a/models/turbine_models/custom_models/stateless_llama.py +++ b/models/turbine_models/custom_models/stateless_llama.py @@ -62,12 +62,6 @@ action="store_true", help="Compile LLM with StreamingLLM optimizations", ) -parser.add_argument( - "--upload_ir", - action=argparse.BooleanOptionalAction, - default=False, - help="upload IR to turbine tank", -) def generate_schema(num_layers): @@ -413,7 +407,6 @@ def evict_kvcache_space(self): args.vulkan_max_allocation, args.streaming_llm, args.vmfb_path, - args.upload_ir, ) safe_name = args.hf_model_name.split("/")[-1].strip() safe_name = re.sub("-", "_", safe_name) diff --git a/models/turbine_models/turbine_tank/turbine_tank.py b/models/turbine_models/turbine_tank/turbine_tank.py index 708218c75..e5947cd58 100644 --- a/models/turbine_models/turbine_tank/turbine_tank.py +++ b/models/turbine_models/turbine_tank/turbine_tank.py @@ -19,12 +19,11 @@ print( f"turbine_tank local cache is located at {WORKDIR} . You may change this by assigning the TURBINE_TANK_CACHE_DIR environment variable." ) -os.makedirs(WORKDIR, exist_ok=True) -storage_account_key = "XSsr+KqxBLxXzRtFv3QbbdsAxdwDGe661Q1xY4ziMRtpCazN8W6HZePi6nwud5RNLC5Y7e410abg+AStyzmX1A==" -storage_account_name = "tankturbine" -connection_string = "DefaultEndpointsProtocol=https;AccountName=tankturbine;AccountKey=XSsr+KqxBLxXzRtFv3QbbdsAxdwDGe661Q1xY4ziMRtpCazN8W6HZePi6nwud5RNLC5Y7e410abg+AStyzmX1A==;EndpointSuffix=core.windows.net" -container_name = "tankturbine" +storage_account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY") +storage_account_name = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME") +connection_string = os.environ.get("AZURE_CONNECTION_STRING") +container_name = os.environ.get("AZURE_CONTAINER_NAME") def get_short_git_sha() -> str: From cde33d945a216b89e5df358cb13677239ea97383 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 21 Feb 2024 12:32:04 -0800 Subject: [PATCH 08/20] add line back --- models/turbine_models/turbine_tank/turbine_tank.py | 1 + 1 file changed, 1 insertion(+) diff --git a/models/turbine_models/turbine_tank/turbine_tank.py b/models/turbine_models/turbine_tank/turbine_tank.py index e5947cd58..8150cc157 100644 --- a/models/turbine_models/turbine_tank/turbine_tank.py +++ b/models/turbine_models/turbine_tank/turbine_tank.py @@ -19,6 +19,7 @@ print( f"turbine_tank local cache is located at {WORKDIR} . You may change this by assigning the TURBINE_TANK_CACHE_DIR environment variable." 
) +os.makedirs(WORKDIR, exist_ok=True) storage_account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY") storage_account_name = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME") From 385d910bd98e9242070a6bb26735ec538009cdb0 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 21 Feb 2024 13:37:31 -0800 Subject: [PATCH 09/20] back to hardcoded credentials --- models/turbine_models/turbine_tank/turbine_tank.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/models/turbine_models/turbine_tank/turbine_tank.py b/models/turbine_models/turbine_tank/turbine_tank.py index 8150cc157..708218c75 100644 --- a/models/turbine_models/turbine_tank/turbine_tank.py +++ b/models/turbine_models/turbine_tank/turbine_tank.py @@ -21,10 +21,10 @@ ) os.makedirs(WORKDIR, exist_ok=True) -storage_account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY") -storage_account_name = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME") -connection_string = os.environ.get("AZURE_CONNECTION_STRING") -container_name = os.environ.get("AZURE_CONTAINER_NAME") +storage_account_key = "XSsr+KqxBLxXzRtFv3QbbdsAxdwDGe661Q1xY4ziMRtpCazN8W6HZePi6nwud5RNLC5Y7e410abg+AStyzmX1A==" +storage_account_name = "tankturbine" +connection_string = "DefaultEndpointsProtocol=https;AccountName=tankturbine;AccountKey=XSsr+KqxBLxXzRtFv3QbbdsAxdwDGe661Q1xY4ziMRtpCazN8W6HZePi6nwud5RNLC5Y7e410abg+AStyzmX1A==;EndpointSuffix=core.windows.net" +container_name = "tankturbine" def get_short_git_sha() -> str: From 07bcc6ff4272c18e216b75f082a848c6246b57f7 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 21 Feb 2024 15:48:39 -0800 Subject: [PATCH 10/20] update to env vars --- models/turbine_models/turbine_tank/turbine_tank.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/models/turbine_models/turbine_tank/turbine_tank.py b/models/turbine_models/turbine_tank/turbine_tank.py index 708218c75..8150cc157 100644 --- a/models/turbine_models/turbine_tank/turbine_tank.py +++ b/models/turbine_models/turbine_tank/turbine_tank.py @@ -21,10 +21,10 @@ ) os.makedirs(WORKDIR, exist_ok=True) -storage_account_key = "XSsr+KqxBLxXzRtFv3QbbdsAxdwDGe661Q1xY4ziMRtpCazN8W6HZePi6nwud5RNLC5Y7e410abg+AStyzmX1A==" -storage_account_name = "tankturbine" -connection_string = "DefaultEndpointsProtocol=https;AccountName=tankturbine;AccountKey=XSsr+KqxBLxXzRtFv3QbbdsAxdwDGe661Q1xY4ziMRtpCazN8W6HZePi6nwud5RNLC5Y7e410abg+AStyzmX1A==;EndpointSuffix=core.windows.net" -container_name = "tankturbine" +storage_account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY") +storage_account_name = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME") +connection_string = os.environ.get("AZURE_CONNECTION_STRING") +container_name = os.environ.get("AZURE_CONTAINER_NAME") def get_short_git_sha() -> str: From d14990169df5b0e15283e609e4a7daecca0630b6 Mon Sep 17 00:00:00 2001 From: saienduri Date: Fri, 23 Feb 2024 17:43:58 -0800 Subject: [PATCH 11/20] add support for external param flow --- models/turbine_models/model_builder.py | 52 +++--- models/turbine_models/tests/sd_test.py | 6 +- .../turbine_models/turbine_tank/run_tank.py | 2 +- .../turbine_models/turbine_tank/tank_test.py | 39 ++-- .../turbine_models/turbine_tank/tank_util.py | 174 +++++++++++++++++- .../turbine_tank/turbine_tank.py | 22 ++- 6 files changed, 233 insertions(+), 62 deletions(-) diff --git a/models/turbine_models/model_builder.py b/models/turbine_models/model_builder.py index 6f5c8b578..2577a39b5 100644 --- a/models/turbine_models/model_builder.py +++ b/models/turbine_models/model_builder.py @@ -21,7 +21,7 @@ 
class HFTransformerBuilder: def __init__( self, example_input: torch.Tensor, - hf_id: str, + hf_id: str = None, auto_model: AutoModel = AutoModel, auto_tokenizer: AutoTokenizer = None, auto_config: AutoConfig = None, @@ -50,15 +50,16 @@ def build_model(self) -> None: Builds a PyTorch model using Hugging Face's transformers library. """ # TODO: check cloud storage for existing ir - self.model = self.auto_model.from_pretrained( - self.hf_id, token=self.hf_auth_token, config=self.auto_config - ) - if self.auto_tokenizer is not None: - self.tokenizer = self.auto_tokenizer.from_pretrained( - self.hf_id, token=self.hf_auth_token + if self.hf_id: + self.model = self.auto_model.from_pretrained( + self.hf_id, token=self.hf_auth_token, config=self.auto_config ) - else: - self.tokenizer = None + if self.auto_tokenizer is not None: + self.tokenizer = self.auto_tokenizer.from_pretrained( + self.hf_id, token=self.hf_auth_token + ) + else: + self.tokenizer = None def get_compiled_module(self, save_to: str = None) -> aot.CompiledModule: """ @@ -74,19 +75,20 @@ def get_compiled_module(self, save_to: str = None) -> aot.CompiledModule: module = aot.export(self.model, *self.example_input) else: module = aot.export(self.model, self.example_input) - module_str = str(module.mlir_module) - safe_name = self.hf_id.split("/")[-1].strip() - safe_name = re.sub("-", "_", safe_name) - if self.upload_ir: - with open(f"{safe_name}.mlir", "w+") as f: - f.write(module_str) - model_name_upload = self.hf_id.replace("/", "_") - turbine_tank.uploadToBlobStorage( - str(os.path.abspath(f"{safe_name}.mlir")), - f"{model_name_upload}/{model_name_upload}.mlir", - ) - os.remove(f"{safe_name}.mlir") - if self.run_e2e is not None and self.run_e2e is False: - return - compiled_binary = module.compile(save_to=save_to) - return compiled_binary + if self.hf_id: + module_str = str(module.mlir_module) + safe_name = self.hf_id.split("/")[-1].strip() + safe_name = re.sub("-", "_", safe_name) + if self.upload_ir: + with open(f"{safe_name}.mlir", "w+") as f: + f.write(module_str) + model_name_upload = self.hf_id.replace("/", "_") + turbine_tank.uploadToBlobStorage( + str(os.path.abspath(f"{safe_name}.mlir")), + f"{model_name_upload}/{model_name_upload}.mlir", + ) + os.remove(f"{safe_name}.mlir") + if self.run_e2e is not None and self.run_e2e is False: + return + compiled_binary = module.compile(save_to=save_to) + return compiled_binary diff --git a/models/turbine_models/tests/sd_test.py b/models/turbine_models/tests/sd_test.py index 01887db72..f555f5fdd 100644 --- a/models/turbine_models/tests/sd_test.py +++ b/models/turbine_models/tests/sd_test.py @@ -262,9 +262,9 @@ def testExportPNDMScheduler(self): "cpu", ) self.assertEqual(cm.exception.code, None) - arguments[ - "external_weight_path" - ] = "stable_diffusion_v1_4_scheduler.safetensors" + arguments["external_weight_path"] = ( + "stable_diffusion_v1_4_scheduler.safetensors" + ) arguments["vmfb_path"] = "stable_diffusion_v1_4_scheduler.vmfb" sample = torch.rand( arguments["batch_size"], diff --git a/models/turbine_models/turbine_tank/run_tank.py b/models/turbine_models/turbine_tank/run_tank.py index 816d92322..a4e77db07 100644 --- a/models/turbine_models/turbine_tank/run_tank.py +++ b/models/turbine_models/turbine_tank/run_tank.py @@ -1,4 +1,4 @@ -# Copyright 2023 Nod Labs, Inc +# Copyright 2024 Advanced Micro Devices, Inc # # Licensed under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
diff --git a/models/turbine_models/turbine_tank/tank_test.py b/models/turbine_models/turbine_tank/tank_test.py index 17c8ec8ba..d1f825e46 100644 --- a/models/turbine_models/turbine_tank/tank_test.py +++ b/models/turbine_models/turbine_tank/tank_test.py @@ -1,10 +1,12 @@ +# Copyright 2024 Advanced Micro Devices, Inc +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + import pytest from turbine_models.turbine_tank import tank_util from turbine_models.model_builder import HFTransformerBuilder -from turbine_models.model_runner import vmfbRunner -from turbine_models.custom_models.sd_inference import utils -from iree import runtime as ireert -import os @pytest.mark.parametrize( @@ -95,6 +97,7 @@ def test_all_models(model_name, model_type, expected_err, run_e2e): import_args = { "batch_size": 1, } + # Based on the model type, get the appropriate hugging face model, inputs, and output if model_type == "vision": torch_model, input, out = tank_util.get_vision_model(model_name, import_args) @@ -112,32 +115,20 @@ def test_all_models(model_name, model_type, expected_err, run_e2e): torch_model, input, out = tank_util.get_hf_img_cls_model( model_name, import_args ) - # compile model and get vmfb + + # create hugging face transformer model model = HFTransformerBuilder( example_input=input, hf_id=model_name, - hf_auth_token="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", upload_ir=True, model=torch_model, model_type=model_type, run_e2e=run_e2e, ) - vmfb_name = model_name.replace("/", "_") + ".vmfb" - model.get_compiled_module(save_to=vmfb_name) - # if model is not supposed to run e2e, exit at this point (mlir has been uploaded) - if run_e2e is False: - assert expected_err > 0 - return - - # run inference using iree runtime - runner = vmfbRunner("local-task", vmfb_name) - inputs = [ireert.asdevicearray(runner.config.device, input)] - keys = list(runner.ctx.modules) - key = keys[len(keys) - 1] - results = runner.ctx.modules.__getattr__(key)["main"](*inputs) - err = utils.largest_error(out.cpu().detach().numpy(), results) - # cleanup - os.remove(vmfb_name) - # accuracy - assert err < expected_err + # runs using external params + tank_util.param_flow( + model, model_name, model_type, input, out, run_e2e, expected_err + ) + # inline weights + tank_util.classic_flow(model, model_name, input, out, run_e2e, expected_err) diff --git a/models/turbine_models/turbine_tank/tank_util.py b/models/turbine_models/turbine_tank/tank_util.py index 4832e9e24..718caf9d7 100644 --- a/models/turbine_models/turbine_tank/tank_util.py +++ b/models/turbine_models/turbine_tank/tank_util.py @@ -1,5 +1,20 @@ +# Copyright 2024 Advanced Micro Devices, Inc +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import torch +import iree.compiler as ireec import torch -import numpy as np +from turbine_models.turbine_tank import tank_util +from turbine_models.model_runner import vmfbRunner +from turbine_models.custom_models.sd_inference import utils +from iree import runtime as ireert +import os +from shark_turbine.aot import * +from iree.compiler.ir import Context +from turbine_models.turbine_tank import turbine_tank torch.manual_seed(0) @@ -88,6 +103,7 @@ def get_hf_img_cls_model(name, import_args): # print("test_input.shape: ", test_input.shape) # test_input.shape: torch.Size([1, 3, 224, 224]) test_input = test_input.repeat(int(import_args["batch_size"]), 1, 1, 1) + print(f"YOOO TEST INPUT: {test_input.shape}") actual_out = model(test_input) # actual_out.shape: torch.Size([1, 1000]) return model, test_input, actual_out @@ -118,9 +134,6 @@ def forward(self, tokens): def get_hf_model(name, import_args): - from transformers import ( - BertTokenizer, - ) model = HuggingFaceLanguage(name) test_input = torch.randint(2, (int(import_args["batch_size"]), 128)) @@ -172,7 +185,9 @@ def get_hf_seq2seq_model(name, import_args): def prepare_sentence_tokens(hf_model: str, sentence: str): - tokenizer = AutoTokenizer.from_pretrained(hf_model) + tokenizer = AutoTokenizer.from_pretrained( + hf_model, token="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" + ) return torch.tensor([tokenizer.encode(sentence)]) @@ -189,6 +204,7 @@ def __init__(self, model_name: str): output_hidden_states=False, torchscript=True, trust_remote_code=True, + token="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", ) self.model.eval() @@ -258,3 +274,151 @@ def get_vision_model(torch_model, import_args): test_input = torch.randn(int(import_args["batch_size"]), 3, *input_image_size) actual_out = model(test_input) return model, test_input, actual_out + + +def compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name): + flags = [ + "--iree-input-type=torch", + "--mlir-print-debuginfo", + "--mlir-print-op-on-diagnostic=false", + "--iree-llvmcpu-target-cpu-features=host", + "--iree-llvmcpu-target-triple=x86_64-linux-gnu", + "--iree-stream-resource-index-bits=64", + "--iree-vm-target-index-bits=64", + "--iree-flow-inline-constants-max-byte-length=1", + ] + if device == "cpu": + flags.append("--iree-llvmcpu-enable-ukernels=all") + device = "llvm-cpu" + elif device == "vulkan": + flags.extend( + [ + "--iree-hal-target-backends=vulkan-spirv", + "--iree-vulkan-target-triple=" + target_triple, + "--iree-stream-resource-max-allocation-size=" + max_alloc, + ] + ) + elif device == "rocm": + flags.extend( + [ + "--iree-hal-target-backends=rocm", + "--iree-rocm-target-chip=" + target_triple, + "--iree-rocm-link-bc=true", + "--iree-rocm-bc-dir=/opt/rocm/amdgcn/bitcode", + "--iree-vm-bytecode-module-strip-source-map=true", + "--iree-opt-strip-assertions=true", + "--iree-vm-target-truncate-unsupported-floats", + ] + ) + elif device == "cuda": + flags.extend( + [ + "--iree-hal-target-backends=cuda", + "--iree-hal-cuda-llvm-target-arch=" + target_triple, + "--iree-vm-bytecode-module-strip-source-map=true", + "--iree-vm-target-truncate-unsupported-floats", + ] + ) + else: + print("incorrect device: ", device) + + flatbuffer_blob = ireec.compile_str( + module_str, + target_backends=[device], + extra_args=flags, + ) + with open(f"{safe_name}.vmfb", "wb+") as f: + f.write(flatbuffer_blob) + print("Saved to", safe_name + ".vmfb") + + +def classic_flow(model, model_name, input, out, run_e2e, expected_err): + 
vmfb_name = model_name.replace("/", "_") + ".vmfb" + model.get_compiled_module(save_to=vmfb_name) + + # if model is not supposed to run e2e, exit at this point (mlir has been uploaded) + if run_e2e is False: + assert expected_err > 0 + return + + # run inference using iree runtime + runner = vmfbRunner("local-task", vmfb_name) + inputs = [ireert.asdevicearray(runner.config.device, input)] + keys = list(runner.ctx.modules) + key = keys[len(keys) - 1] + results = runner.ctx.modules.__getattr__(key)["main"](*inputs) + err = utils.largest_error(out.cpu().detach().numpy(), results) + # cleanup + os.remove(vmfb_name) + # accuracy + assert err < expected_err + + +def param_flow(model, model_name, model_type, input, out, run_e2e, expected_err): + weight_name = model_name.replace("/", "_") + ".safetensors" + mapper = {} + utils.save_external_weights(mapper, model.model, "safetensors", weight_name) + + # seq2seq models differs from rest as it take two inputs (input_ids, decoder_input_ids) + if model_type == "hf_seq2seq": + + class Seq2SeqModule(CompiledModule): + params = export_parameters( + model.model, external=True, external_scope="", name_mapper=mapper.get + ) + + def main( + self, + inp1=AbstractTensor(*(input[0].shape), dtype=input[0].dtype), + inp2=AbstractTensor(*(input[1].shape), dtype=input[1].dtype), + ): + return jittable(model.model.forward)(inp1, inp2) + + inst = Seq2SeqModule(context=Context(), import_to="IMPORT") + module_str = str(CompiledModule.get_mlir_module(inst)) + else: + + class GlobalModule(CompiledModule): + params = export_parameters( + model.model, external=True, external_scope="", name_mapper=mapper.get + ) + + def main(self, inp=AbstractTensor(*input.shape, dtype=input.dtype)): + return jittable(model.model.forward)(inp) + + inst = GlobalModule(context=Context(), import_to="IMPORT") + module_str = str(CompiledModule.get_mlir_module(inst)) + + mlir_name = model_name.replace("/", "_") + ".mlir" + with open(mlir_name, "w+") as f: + f.write(module_str) + + model_name_upload = model_name.replace("/", "_") + turbine_tank.uploadToBlobStorage( + str(os.path.abspath(mlir_name)), + f"{model_name_upload}/{model_name_upload}-params.mlir", + ) + + os.remove(mlir_name) + + if run_e2e is False: + assert expected_err > 0 + return + + vmfb_name = model_name.replace("/", "_") + tank_util.compile_to_vmfb(module_str, "cpu", "", "", vmfb_name) + + # run inference using iree runtime + runner = vmfbRunner("local-task", vmfb_name + ".vmfb", weight_name) + inputs = [ireert.asdevicearray(runner.config.device, input)] + keys = list(runner.ctx.modules) + key = keys[len(keys) - 1] + results = runner.ctx.modules.__getattr__(key)["main"](*inputs) + err = utils.largest_error(out.cpu().detach().numpy(), results) + + # clean up + os.remove(vmfb_name + ".vmfb") + os.remove(weight_name) + + # accuracy + assert err < expected_err diff --git a/models/turbine_models/turbine_tank/turbine_tank.py b/models/turbine_models/turbine_tank/turbine_tank.py index 8150cc157..36dc07a4a 100644 --- a/models/turbine_models/turbine_tank/turbine_tank.py +++ b/models/turbine_models/turbine_tank/turbine_tank.py @@ -1,3 +1,9 @@ +# Copyright 2024 Advanced Micro Devices, Inc +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + from azure.storage.blob import BlobServiceClient import subprocess @@ -65,7 +71,7 @@ def uploadToBlobStorage(file_path, file_name): def checkAndRemoveIfDownloadedOld(model_name: str, model_dir: str, prefix: str): - if os.path.isdir(model_dir) and len(os.listdir(model_dir)) == 1: + if os.path.isdir(model_dir) and len(os.listdir(model_dir)) > 0: for item in os.listdir(model_dir): item_path = os.path.join(model_dir, item) # model artifacts already downloaded and up to date @@ -79,6 +85,12 @@ def checkAndRemoveIfDownloadedOld(model_name: str, model_dir: str, prefix: str): os.remove(os.path.join(item_path, model_name + ".mlir")) os.rmdir(item_path) return False + if os.path.isdir(item_path) and os.path.isfile( + os.path.join(item_path, model_name + "-param.mlir") + ): + os.remove(os.path.join(item_path, model_name + "-param.mlir")) + os.rmdir(item_path) + return False # did not downloaded this model artifacts yet return False @@ -104,9 +116,11 @@ def download_public_folder(model_name: str, prefix: str, model_dir: str): if not os.path.isdir(dest_path): os.makedirs(dest_path) # download blob into local turbine tank cache - with open( - file=os.path.join(model_dir, model_name + ".mlir"), mode="wb" - ) as sample_blob: + if "param" in blob.name: + file_path = os.path.join(model_dir, model_name + "-param.mlir") + else: + file_path = os.path.join(model_dir, model_name + ".mlir") + with open(file=file_path, mode="wb") as sample_blob: download_stream = blob_client.download_blob() sample_blob.write(download_stream.readall()) From f97e3ce09c6a7e099281033c09072eadfad875d5 Mon Sep 17 00:00:00 2001 From: saienduri Date: Sun, 25 Feb 2024 22:54:57 -0800 Subject: [PATCH 12/20] formatting --- models/turbine_models/tests/sd_test.py | 6 +++--- models/turbine_models/turbine_tank/tank_util.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/models/turbine_models/tests/sd_test.py b/models/turbine_models/tests/sd_test.py index f555f5fdd..01887db72 100644 --- a/models/turbine_models/tests/sd_test.py +++ b/models/turbine_models/tests/sd_test.py @@ -262,9 +262,9 @@ def testExportPNDMScheduler(self): "cpu", ) self.assertEqual(cm.exception.code, None) - arguments["external_weight_path"] = ( - "stable_diffusion_v1_4_scheduler.safetensors" - ) + arguments[ + "external_weight_path" + ] = "stable_diffusion_v1_4_scheduler.safetensors" arguments["vmfb_path"] = "stable_diffusion_v1_4_scheduler.vmfb" sample = torch.rand( arguments["batch_size"], diff --git a/models/turbine_models/turbine_tank/tank_util.py b/models/turbine_models/turbine_tank/tank_util.py index 718caf9d7..90b80829c 100644 --- a/models/turbine_models/turbine_tank/tank_util.py +++ b/models/turbine_models/turbine_tank/tank_util.py @@ -134,7 +134,6 @@ def forward(self, tokens): def get_hf_model(name, import_args): - model = HuggingFaceLanguage(name) test_input = torch.randint(2, (int(import_args["batch_size"]), 128)) actual_out = model(test_input) From 8f48a800b6f7af97477e343468de5a49a98db614 Mon Sep 17 00:00:00 2001 From: saienduri Date: Mon, 26 Feb 2024 22:27:10 -0800 Subject: [PATCH 13/20] remove debug --- models/turbine_models/turbine_tank/tank_util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/models/turbine_models/turbine_tank/tank_util.py b/models/turbine_models/turbine_tank/tank_util.py index 90b80829c..154d3dc34 100644 --- a/models/turbine_models/turbine_tank/tank_util.py +++ b/models/turbine_models/turbine_tank/tank_util.py @@ -103,7 +103,6 @@ def get_hf_img_cls_model(name, 
import_args): # print("test_input.shape: ", test_input.shape) # test_input.shape: torch.Size([1, 3, 224, 224]) test_input = test_input.repeat(int(import_args["batch_size"]), 1, 1, 1) - print(f"YOOO TEST INPUT: {test_input.shape}") actual_out = model(test_input) # actual_out.shape: torch.Size([1, 1000]) return model, test_input, actual_out From 826e251eb93beb67fda0741e7db8fd96d90c294b Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 28 Feb 2024 14:00:49 -0800 Subject: [PATCH 14/20] moving turbine tank out to test suite --- .../tests/stateless_llama_test.py | 8 +- .../turbine_models/turbine_tank/run_tank.py | 61 --- .../turbine_models/turbine_tank/tank_test.py | 134 ------ .../turbine_models/turbine_tank/tank_util.py | 422 ------------------ 4 files changed, 6 insertions(+), 619 deletions(-) delete mode 100644 models/turbine_models/turbine_tank/run_tank.py delete mode 100644 models/turbine_models/turbine_tank/tank_test.py delete mode 100644 models/turbine_models/turbine_tank/tank_util.py diff --git a/models/turbine_models/tests/stateless_llama_test.py b/models/turbine_models/tests/stateless_llama_test.py index c72c55e55..1e87120fa 100644 --- a/models/turbine_models/tests/stateless_llama_test.py +++ b/models/turbine_models/tests/stateless_llama_test.py @@ -68,7 +68,9 @@ def test_vmfb_comparison(self): upload_ir=upload_ir_var == "upload", ) - torch_str_cache_path = f"models/turbine_models/tests/vmfb_comparison_cached_torch_output_{precision}_{quantization}.txt" + torch_str_cache_path = ( + f"vmfb_comparison_cached_torch_output_{precision}_{quantization}.txt" + ) # if cached, just read if os.path.exists(torch_str_cache_path): with open(torch_str_cache_path, "r") as f: @@ -109,7 +111,9 @@ def test_streaming_vmfb_comparison(self): vmfb_path="streaming_llama.vmfb", ) - torch_str_cache_path = f"models/turbine_models/tests/vmfb_comparison_cached_torch_output_{precision}_{quantization}.txt" + torch_str_cache_path = ( + f"vmfb_comparison_cached_torch_output_{precision}_{quantization}.txt" + ) # if cached, just read if os.path.exists(torch_str_cache_path): with open(torch_str_cache_path, "r") as f: diff --git a/models/turbine_models/turbine_tank/run_tank.py b/models/turbine_models/turbine_tank/run_tank.py deleted file mode 100644 index a4e77db07..000000000 --- a/models/turbine_models/turbine_tank/run_tank.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 2024 Advanced Micro Devices, Inc -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -import argparse -import unittest -from turbine_models.turbine_tank import tank_util - -import turbine_models.tests.sd_test as sd_test -import os -from turbine_models.turbine_tank import turbine_tank - -import pytest - -parser = argparse.ArgumentParser() -parser.add_argument( - "--download_ir", - action=argparse.BooleanOptionalAction, - default=False, - help="download IR from turbine tank", -) - -if __name__ == "__main__": - args = parser.parse_args() - - if args.download_ir: - turbine_tank.downloadModelArtifacts("CompVis/stable-diffusion-v1-4-clip") - turbine_tank.downloadModelArtifacts("CompVis/stable-diffusion-v1-4-vae-decode") - turbine_tank.downloadModelArtifacts("CompVis/stable-diffusion-v1-4-vae-encode") - turbine_tank.downloadModelArtifacts("CompVis/stable-diffusion-v1-4-unet") - turbine_tank.downloadModelArtifacts( - "Trelis/Llama-2-7b-chat-hf-function-calling-v2" - ) - for model_name, _ in tank_util.model_list: - turbine_tank.downloadModelArtifacts(model_name) - else: - import turbine_models.tests.stateless_llama_test as stateless_llama_test - - # environment variable used to let the llama/sd tests know we are running from tank and want to upload - os.environ["TURBINE_TANK_ACTION"] = "upload" - - # run existing turbine llama and sd tests integrated with turbine tank - llama_suite = unittest.TestLoader().loadTestsFromModule(stateless_llama_test) - unittest.TextTestRunner(verbosity=2).run(llama_suite) - - sd_suite = unittest.TestLoader().loadTestsFromModule(sd_test) - unittest.TextTestRunner(verbosity=2).run(sd_suite) - - # cleanup - os.remove("Llama_2_7b_chat_hf_function_calling_v2_f32_unquantized.safetensors") - os.remove("Llama_2_7b_chat_hf_function_calling_v2.mlir") - os.remove("Llama_2_7b_chat_hf_function_calling_v2.vmfb") - os.remove("streaming_llama.vmfb") - os.remove("stable_diffusion_v1_4_clip.mlir") - os.remove("stable_diffusion_v1_4_unet.mlir") - os.remove("stable_diffusion_v1_4_vae.mlir") - - # runs tank_test.py (only pytest file in this directory, runs 30 models e2e) - pytest.main(["-v", os.path.dirname(os.path.abspath(__file__))]) diff --git a/models/turbine_models/turbine_tank/tank_test.py b/models/turbine_models/turbine_tank/tank_test.py deleted file mode 100644 index d1f825e46..000000000 --- a/models/turbine_models/turbine_tank/tank_test.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2024 Advanced Micro Devices, Inc -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -import pytest -from turbine_models.turbine_tank import tank_util -from turbine_models.model_builder import HFTransformerBuilder - - -@pytest.mark.parametrize( - "model_name,model_type,expected_err,run_e2e", - [ - ("microsoft/resnet-50", "hf_img_cls", 8e-05, True), - ("bert-large-uncased", "hf", 8e-06, True), - ("facebook/deit-small-distilled-patch16-224", "hf_img_cls", 8e-05, True), - ("google/vit-base-patch16-224", "hf_img_cls", 8e-05, True), - ("microsoft/beit-base-patch16-224-pt22k-ft22k", "hf_img_cls", 8e-05, True), - ("microsoft/MiniLM-L12-H384-uncased", "hf", 5e-07, True), - ("google/mobilebert-uncased", "hf", 4.3, True), - ("mobilenet_v3_small", "vision", 6e-05, True), - ("nvidia/mit-b0", "hf_img_cls", 7.3, True), - ("resnet101", "vision", 8e-06, True), - ("resnet18", "vision", 8e-06, True), - ("resnet50", "vision", 8e-06, True), - ("squeezenet1_0", "vision", 9e-06, True), - ("wide_resnet50_2", "vision", 9e-06, True), - ("mnasnet1_0", "vision", 2e-05, True), - pytest.param( - "t5-base", - "hf_seq2seq", - -1, - False, - marks=pytest.mark.xfail(reason="iree-compile fails"), - ), - pytest.param( - "t5-large", - "hf_seq2seq", - -1, - False, - marks=pytest.mark.xfail(reason="iree-compile fails"), - ), - ("openai/whisper-base", "hf_causallm", 9e-05, True), - ("openai/whisper-small", "hf_causallm", 0.0003, True), - ("openai/whisper-medium", "hf_causallm", 0.0003, True), - ("facebook/opt-350m", "hf", 9e-07, True), - ("facebook/opt-1.3b", "hf", 9e-06, True), - ("BAAI/bge-base-en-v1.5", "hf", 9e-07, True), - pytest.param( - "facebook/bart-large", - "hf_seq2seq", - -1, - False, - marks=pytest.mark.xfail(reason="iree-compile fails"), - ), - pytest.param( - "gpt2", - "hf", - -1, - False, - marks=pytest.mark.xfail(reason="iree-compile fails"), - ), - pytest.param( - "gpt2-xl", - "hf", - -1, - False, - marks=pytest.mark.xfail(reason="iree-compile fails"), - ), - ("lmsys/vicuna-13b-v1.3", "hf", 5e-05, True), - pytest.param( - "microsoft/phi-1_5", - "hf_causallm", - -1, - True, - marks=pytest.mark.xfail(reason="correctness issue"), - ), # nan error reported (correctness issue) - pytest.param( - "microsoft/phi-2", - "hf_causallm", - -1, - True, - marks=pytest.mark.xfail(reason="correctness issue"), - ), # nan error reported (correctness issue) - pytest.param( - "mosaicml/mpt-30b", - "hf_causallm", - -1, - False, - marks=pytest.mark.xfail(reason="iree-compile fails"), - ), - ("stabilityai/stablelm-3b-4e1t", "hf_causallm", 0.0004, True), - ], -) -def test_all_models(model_name, model_type, expected_err, run_e2e): - import_args = { - "batch_size": 1, - } - - # Based on the model type, get the appropriate hugging face model, inputs, and output - if model_type == "vision": - torch_model, input, out = tank_util.get_vision_model(model_name, import_args) - elif model_type == "hf": - torch_model, input, out = tank_util.get_hf_model(model_name, import_args) - elif model_type == "hf_seq2seq": - torch_model, input, out = tank_util.get_hf_seq2seq_model( - model_name, import_args - ) - elif model_type == "hf_causallm": - torch_model, input, out = tank_util.get_hf_causallm_model( - model_name, import_args - ) - elif model_type == "hf_img_cls": - torch_model, input, out = tank_util.get_hf_img_cls_model( - model_name, import_args - ) - - # create hugging face transformer model - model = HFTransformerBuilder( - example_input=input, - hf_id=model_name, - upload_ir=True, - model=torch_model, - model_type=model_type, - run_e2e=run_e2e, - ) - - # runs using 
external params - tank_util.param_flow( - model, model_name, model_type, input, out, run_e2e, expected_err - ) - # inline weights - tank_util.classic_flow(model, model_name, input, out, run_e2e, expected_err) diff --git a/models/turbine_models/turbine_tank/tank_util.py b/models/turbine_models/turbine_tank/tank_util.py deleted file mode 100644 index 154d3dc34..000000000 --- a/models/turbine_models/turbine_tank/tank_util.py +++ /dev/null @@ -1,422 +0,0 @@ -# Copyright 2024 Advanced Micro Devices, Inc -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -import torch -import iree.compiler as ireec -import torch -from turbine_models.turbine_tank import tank_util -from turbine_models.model_runner import vmfbRunner -from turbine_models.custom_models.sd_inference import utils -from iree import runtime as ireert -import os -from shark_turbine.aot import * -from iree.compiler.ir import Context -from turbine_models.turbine_tank import turbine_tank - -torch.manual_seed(0) - -BATCH_SIZE = 1 - -model_list = [ - ("microsoft/resnet-50", "hf_img_cls"), - ("bert-large-uncased", "hf"), - ("facebook/deit-small-distilled-patch16-224", "hf_img_cls"), - ("google/vit-base-patch16-224", "hf_img_cls"), - ("microsoft/beit-base-patch16-224-pt22k-ft22k", "hf_img_cls"), - ("microsoft/MiniLM-L12-H384-uncased", "hf"), - ("google/mobilebert-uncased", "hf"), - ("mobilenet_v3_small", "vision"), - ("nvidia/mit-b0", "hf_img_cls"), - ("resnet101", "vision"), - ("resnet18", "vision"), - ("resnet50", "vision"), - ("squeezenet1_0", "vision"), - ("wide_resnet50_2", "vision"), - ("mnasnet1_0", "vision"), - ("t5-base", "hf_seq2seq"), # iree-compile failure - ("t5-large", "hf_seq2seq"), # iree-compile failure - ("openai/whisper-base", "hf_causallm"), - ("openai/whisper-small", "hf_causallm"), - ("openai/whisper-medium", "hf_causallm"), - ("facebook/opt-350m", "hf"), - ("facebook/opt-1.3b", "hf"), - ("BAAI/bge-base-en-v1.5", "hf"), - ("facebook/bart-large", "hf_seq2seq"), # iree-compile fails - ("gpt2", "hf"), # iree-compile fails - ("gpt2-xl", "hf"), # iree-compile fails - ("lmsys/vicuna-13b-v1.3", "hf"), - ("microsoft/phi-1_5", "hf_causallm"), # nan error reported (correctness issue) - ("microsoft/phi-2", "hf_causallm"), # nan error reported (correctness issue) - ("mosaicml/mpt-30b", "hf_causallm"), # iree-compile fails - ("stabilityai/stablelm-3b-4e1t", "hf_causallm"), -] - - -##################### Hugging Face Image Classification Models ################################### -from transformers import AutoModelForImageClassification -from transformers import AutoFeatureExtractor -from PIL import Image -import requests - - -def preprocess_input_image(model_name): - # from datasets import load_dataset - # dataset = load_dataset("huggingface/cats-image") - # image1 = dataset["test"]["image"][0] - # # print("image1: ", image1) # - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - # - image = Image.open(requests.get(url, stream=True).raw) - # feature_extractor = img_models_fe_dict[model_name].from_pretrained( - # model_name - # ) - feature_extractor = AutoFeatureExtractor.from_pretrained(model_name) - inputs = feature_extractor(images=image, return_tensors="pt") - # inputs = {'pixel_values': tensor([[[[ 0.1137..., -0.2000, -0.4275, -0.5294]]]])} - # torch.Size([1, 3, 224, 224]), torch.FloatTensor - - return inputs[str(*inputs)] - - -class HuggingFaceImageClassification(torch.nn.Module): - 
def __init__(self, hf_model_name): - super().__init__() - self.model = AutoModelForImageClassification.from_pretrained( - hf_model_name, # The pretrained model. - output_attentions=False, # Whether the model returns attentions weights. - return_dict=False, # https://github.com/huggingface/transformers/issues/9095 - torchscript=True, - ) - - def forward(self, inputs): - return self.model.forward(inputs)[0] - - -def get_hf_img_cls_model(name, import_args): - model = HuggingFaceImageClassification(name) - # you can use preprocess_input_image to get the test_input or just random value. - test_input = preprocess_input_image(name) - # test_input = torch.FloatTensor(1, 3, 224, 224).uniform_(-1, 1) - # print("test_input.shape: ", test_input.shape) - # test_input.shape: torch.Size([1, 3, 224, 224]) - test_input = test_input.repeat(int(import_args["batch_size"]), 1, 1, 1) - actual_out = model(test_input) - # actual_out.shape: torch.Size([1, 1000]) - return model, test_input, actual_out - - -##################### Hugging Face LM Models ################################### - - -class HuggingFaceLanguage(torch.nn.Module): - def __init__(self, hf_model_name): - super().__init__() - from transformers import AutoModelForSequenceClassification, AutoTokenizer - import transformers as trf - - transformers_path = trf.__path__[0] - hf_model_path = f"{transformers_path}/models/{hf_model_name}" - self.model = AutoModelForSequenceClassification.from_pretrained( - hf_model_name, # The pretrained model. - num_labels=2, # The number of output labels--2 for binary classification. - output_attentions=False, # Whether the model returns attentions weights. - output_hidden_states=False, # Whether the model returns all hidden-states. - torchscript=True, - ) - self.model.config.pad_token_id = None - - def forward(self, tokens): - return self.model.forward(tokens)[0] - - -def get_hf_model(name, import_args): - model = HuggingFaceLanguage(name) - test_input = torch.randint(2, (int(import_args["batch_size"]), 128)) - actual_out = model(test_input) - return model, test_input, actual_out - - -##################### Hugging Face Seq2SeqLM Models ################################### - -# We use a maximum sequence length of 512 since this is the default used in the T5 config. 
-T5_MAX_SEQUENCE_LENGTH = 512 - - -class HFSeq2SeqLanguageModel(torch.nn.Module): - def __init__(self, model_name): - super().__init__() - from transformers import AutoTokenizer, T5Model - - self.tokenizer = AutoTokenizer.from_pretrained(model_name) - self.tokenization_kwargs = { - "pad_to_multiple_of": T5_MAX_SEQUENCE_LENGTH, - "padding": True, - "return_tensors": "pt", - } - self.model = T5Model.from_pretrained(model_name, return_dict=True) - - def preprocess_input(self, text): - return self.tokenizer(text, **self.tokenization_kwargs) - - def forward(self, input_ids, decoder_input_ids): - return self.model.forward(input_ids, decoder_input_ids=decoder_input_ids)[0] - - -def get_hf_seq2seq_model(name, import_args): - m = HFSeq2SeqLanguageModel(name) - encoded_input_ids = m.preprocess_input( - "Studies have been shown that owning a dog is good for you" - ).input_ids - decoder_input_ids = m.preprocess_input("Studies show that").input_ids - decoder_input_ids = m.model._shift_right(decoder_input_ids) - - test_input = (encoded_input_ids, decoder_input_ids) - actual_out = m.forward(*test_input) - return m, test_input, actual_out - - -##################### Hugging Face CausalLM Models ################################### -from transformers import AutoTokenizer, AutoModelForCausalLM - - -def prepare_sentence_tokens(hf_model: str, sentence: str): - tokenizer = AutoTokenizer.from_pretrained( - hf_model, token="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" - ) - return torch.tensor([tokenizer.encode(sentence)]) - - -class HFCausalLM(torch.nn.Module): - def __init__(self, model_name: str): - super().__init__() - self.model = AutoModelForCausalLM.from_pretrained( - model_name, # The pretrained model name. - # The number of output labels--2 for binary classification. - num_labels=2, - # Whether the model returns attentions weights. - output_attentions=False, - # Whether the model returns all hidden-states. 
- output_hidden_states=False, - torchscript=True, - trust_remote_code=True, - token="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", - ) - self.model.eval() - - def forward(self, tokens): - return self.model.forward(tokens)[0] - - -def get_hf_causallm_model(name, import_args): - m = HFCausalLM(name) - test_input = prepare_sentence_tokens(name, "this project is very interesting") - actual_out = m.forward(test_input) - return m, test_input, actual_out - - -################################################################################ - -##################### Torch Vision Models ################################### - - -class VisionModule(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - self.train(False) - - def forward(self, input): - return self.model.forward(input) - - -def get_vision_model(torch_model, import_args): - import torchvision.models as models - - default_image_size = (224, 224) - modelname = torch_model - if modelname == "alexnet": - torch_model = models.alexnet(weights="DEFAULT") - input_image_size = default_image_size - if modelname == "resnet18": - torch_model = models.resnet18(weights="DEFAULT") - input_image_size = default_image_size - if modelname == "resnet50": - torch_model = models.resnet50(weights="DEFAULT") - input_image_size = default_image_size - if modelname == "resnet50_fp16": - torch_model = models.resnet50(weights="DEFAULT") - input_image_size = default_image_size - if modelname == "resnet50_fp16": - torch_model = models.resnet50(weights="DEFAULT") - input_image_size = default_image_size - if modelname == "resnet101": - torch_model = models.resnet101(weights="DEFAULT") - input_image_size = default_image_size - if modelname == "squeezenet1_0": - torch_model = models.squeezenet1_0(weights="DEFAULT") - input_image_size = default_image_size - if modelname == "wide_resnet50_2": - torch_model = models.wide_resnet50_2(weights="DEFAULT") - input_image_size = default_image_size - if modelname == "mobilenet_v3_small": - torch_model = models.mobilenet_v3_small(weights="DEFAULT") - input_image_size = default_image_size - if modelname == "mnasnet1_0": - torch_model = models.mnasnet1_0(weights="DEFAULT") - input_image_size = default_image_size - - model = VisionModule(torch_model) - test_input = torch.randn(int(import_args["batch_size"]), 3, *input_image_size) - actual_out = model(test_input) - return model, test_input, actual_out - - -def compile_to_vmfb(module_str, device, target_triple, max_alloc, safe_name): - flags = [ - "--iree-input-type=torch", - "--mlir-print-debuginfo", - "--mlir-print-op-on-diagnostic=false", - "--iree-llvmcpu-target-cpu-features=host", - "--iree-llvmcpu-target-triple=x86_64-linux-gnu", - "--iree-stream-resource-index-bits=64", - "--iree-vm-target-index-bits=64", - "--iree-flow-inline-constants-max-byte-length=1", - ] - if device == "cpu": - flags.append("--iree-llvmcpu-enable-ukernels=all") - device = "llvm-cpu" - elif device == "vulkan": - flags.extend( - [ - "--iree-hal-target-backends=vulkan-spirv", - "--iree-vulkan-target-triple=" + target_triple, - "--iree-stream-resource-max-allocation-size=" + max_alloc, - ] - ) - elif device == "rocm": - flags.extend( - [ - "--iree-hal-target-backends=rocm", - "--iree-rocm-target-chip=" + target_triple, - "--iree-rocm-link-bc=true", - "--iree-rocm-bc-dir=/opt/rocm/amdgcn/bitcode", - "--iree-vm-bytecode-module-strip-source-map=true", - "--iree-opt-strip-assertions=true", - "--iree-vm-target-truncate-unsupported-floats", - ] - ) - elif device == "cuda": - flags.extend( - [ 
- "--iree-hal-target-backends=cuda", - "--iree-hal-cuda-llvm-target-arch=" + target_triple, - "--iree-vm-bytecode-module-strip-source-map=true", - "--iree-vm-target-truncate-unsupported-floats", - ] - ) - else: - print("incorrect device: ", device) - - flatbuffer_blob = ireec.compile_str( - module_str, - target_backends=[device], - extra_args=flags, - ) - with open(f"{safe_name}.vmfb", "wb+") as f: - f.write(flatbuffer_blob) - print("Saved to", safe_name + ".vmfb") - - -def classic_flow(model, model_name, input, out, run_e2e, expected_err): - vmfb_name = model_name.replace("/", "_") + ".vmfb" - model.get_compiled_module(save_to=vmfb_name) - - # if model is not supposed to run e2e, exit at this point (mlir has been uploaded) - if run_e2e is False: - assert expected_err > 0 - return - - # run inference using iree runtime - runner = vmfbRunner("local-task", vmfb_name) - inputs = [ireert.asdevicearray(runner.config.device, input)] - keys = list(runner.ctx.modules) - key = keys[len(keys) - 1] - results = runner.ctx.modules.__getattr__(key)["main"](*inputs) - err = utils.largest_error(out.cpu().detach().numpy(), results) - # cleanup - os.remove(vmfb_name) - # accuracy - assert err < expected_err - - -def param_flow(model, model_name, model_type, input, out, run_e2e, expected_err): - weight_name = model_name.replace("/", "_") + ".safetensors" - mapper = {} - utils.save_external_weights(mapper, model.model, "safetensors", weight_name) - - # seq2seq models differs from rest as it take two inputs (input_ids, decoder_input_ids) - if model_type == "hf_seq2seq": - - class Seq2SeqModule(CompiledModule): - params = export_parameters( - model.model, external=True, external_scope="", name_mapper=mapper.get - ) - - def main( - self, - inp1=AbstractTensor(*(input[0].shape), dtype=input[0].dtype), - inp2=AbstractTensor(*(input[1].shape), dtype=input[1].dtype), - ): - return jittable(model.model.forward)(inp1, inp2) - - inst = Seq2SeqModule(context=Context(), import_to="IMPORT") - module_str = str(CompiledModule.get_mlir_module(inst)) - else: - - class GlobalModule(CompiledModule): - params = export_parameters( - model.model, external=True, external_scope="", name_mapper=mapper.get - ) - - def main(self, inp=AbstractTensor(*input.shape, dtype=input.dtype)): - return jittable(model.model.forward)(inp) - - inst = GlobalModule(context=Context(), import_to="IMPORT") - module_str = str(CompiledModule.get_mlir_module(inst)) - - mlir_name = model_name.replace("/", "_") + ".mlir" - with open(mlir_name, "w+") as f: - f.write(module_str) - - model_name_upload = model_name.replace("/", "_") - turbine_tank.uploadToBlobStorage( - str(os.path.abspath(mlir_name)), - f"{model_name_upload}/{model_name_upload}-params.mlir", - ) - - os.remove(mlir_name) - - if run_e2e is False: - assert expected_err > 0 - return - - vmfb_name = model_name.replace("/", "_") - tank_util.compile_to_vmfb(module_str, "cpu", "", "", vmfb_name) - - # run inference using iree runtime - runner = vmfbRunner("local-task", vmfb_name + ".vmfb", weight_name) - inputs = [ireert.asdevicearray(runner.config.device, input)] - keys = list(runner.ctx.modules) - key = keys[len(keys) - 1] - results = runner.ctx.modules.__getattr__(key)["main"](*inputs) - err = utils.largest_error(out.cpu().detach().numpy(), results) - - # clean up - os.remove(vmfb_name + ".vmfb") - os.remove(weight_name) - - # accuracy - assert err < expected_err From 821ecf329317afec096c7c32c1ccf3f0a7075e5f Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 28 Feb 2024 14:14:15 -0800 Subject: 
[PATCH 15/20] add for schedulers too --- .../custom_models/sd_inference/schedulers.py | 12 ++++++++++++ models/turbine_models/tests/sd_test.py | 2 ++ 2 files changed, 14 insertions(+) diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index 97bd2418f..a9475d080 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -23,6 +23,8 @@ import safetensors import argparse +from turbine_models.turbine_tank import turbine_tank + parser = argparse.ArgumentParser() parser.add_argument( "--hf_auth_token", type=str, help="The Hugging Face auth token, required" @@ -111,6 +113,7 @@ def export_scheduler( device=None, target_triple=None, max_alloc=None, + upload_ir=False, ): mapper = {} utils.save_external_weights( @@ -145,6 +148,15 @@ def main( module_str = str(CompiledModule.get_mlir_module(inst)) safe_name = utils.create_safe_name(hf_model_name, "-scheduler") + if upload_ir: + with open(f"{safe_name}.mlir", "w+") as f: + f.write(module_str) + model_name_upload = hf_model_name.replace("/", "_") + model_name_upload = model_name_upload + "-scheduler" + turbine_tank.uploadToBlobStorage( + str(os.path.abspath(f"{safe_name}.mlir")), + f"{model_name_upload}/{model_name_upload}.mlir", + ) if compile_to != "vmfb": return module_str else: diff --git a/models/turbine_models/tests/sd_test.py b/models/turbine_models/tests/sd_test.py index 01887db72..9d00fb9e5 100644 --- a/models/turbine_models/tests/sd_test.py +++ b/models/turbine_models/tests/sd_test.py @@ -247,6 +247,7 @@ def testExportVaeModelEncode(self): @unittest.expectedFailure def testExportPNDMScheduler(self): + upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload") with self.assertRaises(SystemExit) as cm: schedulers.export_scheduler( scheduler_module, @@ -260,6 +261,7 @@ def testExportPNDMScheduler(self): "safetensors", "stable_diffusion_v1_4_scheduler.safetensors", "cpu", + upload_ir=upload_ir_var == "upload", ) self.assertEqual(cm.exception.code, None) arguments[ From 8bcac10332bf66147b95841684878652288cd87b Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 28 Feb 2024 14:25:25 -0800 Subject: [PATCH 16/20] better var name --- models/turbine_models/model_builder.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/models/turbine_models/model_builder.py b/models/turbine_models/model_builder.py index 2577a39b5..f91c65a08 100644 --- a/models/turbine_models/model_builder.py +++ b/models/turbine_models/model_builder.py @@ -29,7 +29,7 @@ def __init__( upload_ir=False, model=None, model_type: str = None, - run_e2e: bool = None, + compile_to_vmfb: bool = None, ) -> None: self.example_input = example_input self.hf_id = hf_id @@ -41,7 +41,7 @@ def __init__( self.tokenizer = None self.upload_ir = upload_ir self.model_type = model_type - self.run_e2e = run_e2e + self.compile_to_vmfb = compile_to_vmfb if self.model == None: self.build_model() @@ -88,7 +88,7 @@ def get_compiled_module(self, save_to: str = None) -> aot.CompiledModule: f"{model_name_upload}/{model_name_upload}.mlir", ) os.remove(f"{safe_name}.mlir") - if self.run_e2e is not None and self.run_e2e is False: + if self.compile_to_vmfb is not None and self.compile_to_vmfb is False: return compiled_binary = module.compile(save_to=save_to) return compiled_binary From 7736efcd68055769217c1cff0f881791582985f4 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 28 Feb 2024 14:26:29 -0800 Subject: 
[PATCH 17/20] empty init file --- models/turbine_models/turbine_tank/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 models/turbine_models/turbine_tank/__init__.py diff --git a/models/turbine_models/turbine_tank/__init__.py b/models/turbine_models/turbine_tank/__init__.py new file mode 100644 index 000000000..e69de29bb From 45a3a275370bacd6e58e7637aeeb4efd6859fa4a Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 28 Feb 2024 16:29:40 -0800 Subject: [PATCH 18/20] clean checks --- models/turbine_models/model_builder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/turbine_models/model_builder.py b/models/turbine_models/model_builder.py index f91c65a08..035244534 100644 --- a/models/turbine_models/model_builder.py +++ b/models/turbine_models/model_builder.py @@ -71,7 +71,7 @@ def get_compiled_module(self, save_to: str = None) -> aot.CompiledModule: Returns: aot.CompiledModule: The compiled module binary. """ - if self.model_type == "hf_seq2seq": + if self.model_type and self.model_type == "hf_seq2seq": module = aot.export(self.model, *self.example_input) else: module = aot.export(self.model, self.example_input) @@ -88,7 +88,7 @@ def get_compiled_module(self, save_to: str = None) -> aot.CompiledModule: f"{model_name_upload}/{model_name_upload}.mlir", ) os.remove(f"{safe_name}.mlir") - if self.compile_to_vmfb is not None and self.compile_to_vmfb is False: + if self.compile_to_vmfb is False: return compiled_binary = module.compile(save_to=save_to) return compiled_binary From a898364e7d6bc6bc61e4a20dd1a73cf4880be47c Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 28 Feb 2024 18:16:14 -0800 Subject: [PATCH 19/20] address nit --- models/turbine_models/custom_models/sd_inference/clip.py | 2 +- models/turbine_models/custom_models/sd_inference/schedulers.py | 2 +- models/turbine_models/custom_models/sd_inference/unet.py | 2 +- models/turbine_models/custom_models/sd_inference/vae.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index 4cc5f91dd..b37eaf847 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -101,7 +101,7 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)): with open(f"{safe_name}.mlir", "w+") as f: f.write(module_str) model_name_upload = hf_model_name.replace("/", "_") - model_name_upload += "-clip" + model_name_upload += "_clip" turbine_tank.uploadToBlobStorage( str(os.path.abspath(f"{safe_name}.mlir")), f"{model_name_upload}/{model_name_upload}.mlir", diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index a9475d080..951187524 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -152,7 +152,7 @@ def main( with open(f"{safe_name}.mlir", "w+") as f: f.write(module_str) model_name_upload = hf_model_name.replace("/", "_") - model_name_upload = model_name_upload + "-scheduler" + model_name_upload = model_name_upload + "_scheduler" turbine_tank.uploadToBlobStorage( str(os.path.abspath(f"{safe_name}.mlir")), f"{model_name_upload}/{model_name_upload}.mlir", diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index 
2c1556e84..1b351a078 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -131,7 +131,7 @@ def main( with open(f"{safe_name}.mlir", "w+") as f: f.write(module_str) model_name_upload = hf_model_name.replace("/", "_") - model_name_upload += "-unet" + model_name_upload += "_unet" turbine_tank.uploadToBlobStorage( str(os.path.abspath(f"{safe_name}.mlir")), f"{model_name_upload}/{model_name_upload}.mlir", diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index fcf9453b4..e169187ea 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -119,7 +119,7 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)): with open(f"{safe_name}.mlir", "w+") as f: f.write(module_str) model_name_upload = hf_model_name.replace("/", "_") - model_name_upload = model_name_upload + "-vae-" + variant + model_name_upload = model_name_upload + "_vae_" + variant turbine_tank.uploadToBlobStorage( str(os.path.abspath(f"{safe_name}.mlir")), f"{model_name_upload}/{model_name_upload}.mlir", From 4d7edfa1c5450a74462579fc5b96660bffb1ed84 Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 28 Feb 2024 18:39:17 -0800 Subject: [PATCH 20/20] revert nit --- models/turbine_models/custom_models/sd_inference/clip.py | 2 +- models/turbine_models/custom_models/sd_inference/schedulers.py | 2 +- models/turbine_models/custom_models/sd_inference/unet.py | 2 +- models/turbine_models/custom_models/sd_inference/vae.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/models/turbine_models/custom_models/sd_inference/clip.py b/models/turbine_models/custom_models/sd_inference/clip.py index b37eaf847..4cc5f91dd 100644 --- a/models/turbine_models/custom_models/sd_inference/clip.py +++ b/models/turbine_models/custom_models/sd_inference/clip.py @@ -101,7 +101,7 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)): with open(f"{safe_name}.mlir", "w+") as f: f.write(module_str) model_name_upload = hf_model_name.replace("/", "_") - model_name_upload += "_clip" + model_name_upload += "-clip" turbine_tank.uploadToBlobStorage( str(os.path.abspath(f"{safe_name}.mlir")), f"{model_name_upload}/{model_name_upload}.mlir", diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py index 951187524..6dafeb313 100644 --- a/models/turbine_models/custom_models/sd_inference/schedulers.py +++ b/models/turbine_models/custom_models/sd_inference/schedulers.py @@ -151,7 +151,7 @@ def main( if upload_ir: with open(f"{safe_name}.mlir", "w+") as f: f.write(module_str) - model_name_upload = hf_model_name.replace("/", "_") + model_name_upload = hf_model_name.replace("/", "-") model_name_upload = model_name_upload + "_scheduler" turbine_tank.uploadToBlobStorage( str(os.path.abspath(f"{safe_name}.mlir")), diff --git a/models/turbine_models/custom_models/sd_inference/unet.py b/models/turbine_models/custom_models/sd_inference/unet.py index 1b351a078..398ed9bc5 100644 --- a/models/turbine_models/custom_models/sd_inference/unet.py +++ b/models/turbine_models/custom_models/sd_inference/unet.py @@ -130,7 +130,7 @@ def main( if upload_ir: with open(f"{safe_name}.mlir", "w+") as f: f.write(module_str) - model_name_upload = hf_model_name.replace("/", "_") + model_name_upload = hf_model_name.replace("/", "-") model_name_upload += 
"_unet" turbine_tank.uploadToBlobStorage( str(os.path.abspath(f"{safe_name}.mlir")), diff --git a/models/turbine_models/custom_models/sd_inference/vae.py b/models/turbine_models/custom_models/sd_inference/vae.py index e169187ea..fcf9453b4 100644 --- a/models/turbine_models/custom_models/sd_inference/vae.py +++ b/models/turbine_models/custom_models/sd_inference/vae.py @@ -119,7 +119,7 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)): with open(f"{safe_name}.mlir", "w+") as f: f.write(module_str) model_name_upload = hf_model_name.replace("/", "_") - model_name_upload = model_name_upload + "_vae_" + variant + model_name_upload = model_name_upload + "-vae-" + variant turbine_tank.uploadToBlobStorage( str(os.path.abspath(f"{safe_name}.mlir")), f"{model_name_upload}/{model_name_upload}.mlir",