Turbine Tank Turbine Changes #437

Merged: 24 commits, Feb 29, 2024
Changes shown from 6 commits
models/requirements.txt (4 additions, 0 deletions)
@@ -5,3 +5,7 @@ transformers
accelerate
diffusers==0.24.0
brevitas @ git+https://github.com/Xilinx/brevitas.git@6695e8df7f6a2c7715b9ed69c4b78157376bb60b
# turbine tank downloading/uploading
azure-storage-blob
# microsoft/phi model
einops
models/setup.py (2 additions, 0 deletions)
@@ -61,5 +61,7 @@ def load_version_info():
"transformers",
"accelerate",
"diffusers==0.24.0",
"azure-storage-blob",
"einops",
],
)
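The turbine_tank module that these diffs import is added elsewhere in the PR and is not shown in this view. As a rough sketch only, an uploadToBlobStorage(file_path, blob_name) helper built on the new azure-storage-blob dependency could look like the following; the container name and the connection-string environment variable are assumptions, not taken from this PR:

```python
import os
from azure.storage.blob import BlobServiceClient

def uploadToBlobStorage(file_path: str, blob_name: str):
    # Credentials from the environment; the variable name is an assumption.
    connection_string = os.environ["AZURE_STORAGE_CONNECTION_STRING"]
    service = BlobServiceClient.from_connection_string(connection_string)
    # Container name "turbine-tank" is hypothetical.
    blob_client = service.get_blob_client(container="turbine-tank", blob=blob_name)
    with open(file_path, "rb") as data:
        blob_client.upload_blob(data, overwrite=True)
    print(f"Uploaded {file_path} as {blob_name}")
```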
models/turbine_models/custom_models/sd_inference/clip.py (19 additions, 0 deletions)
@@ -16,6 +16,7 @@
import torch
import torch._dynamo as dynamo
from transformers import CLIPTextModel, CLIPTokenizer
from turbine_models.turbine_tank import turbine_tank

import argparse

@@ -46,6 +47,12 @@
help="Specify vulkan target triple or rocm/cuda target device.",
)
parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296")
parser.add_argument(
"--upload_ir",
action=argparse.BooleanOptionalAction,
default=False,
help="upload IR to turbine tank",
)


def export_clip_model(
@@ -57,13 +64,15 @@ def export_clip_model(
device=None,
target_triple=None,
max_alloc=None,
upload_ir=False,
):
# Load the tokenizer and text encoder to tokenize and encode the text.
tokenizer = CLIPTokenizer.from_pretrained(
hf_model_name,
subfolder="tokenizer",
token=hf_auth_token,
)

text_encoder_model = CLIPTextModel.from_pretrained(
hf_model_name,
subfolder="text_encoder",
@@ -94,6 +103,15 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)):

module_str = str(CompiledModule.get_mlir_module(inst))
safe_name = utils.create_safe_name(hf_model_name, "-clip")
if upload_ir:
with open(f"{safe_name}.mlir", "w+") as f:
f.write(module_str)
model_name_upload = hf_model_name.replace("/", "_")
model_name_upload += "-clip"
turbine_tank.uploadToBlobStorage(
str(os.path.abspath(f"{safe_name}.mlir")),
f"{model_name_upload}/{model_name_upload}.mlir",
)
if compile_to != "vmfb":
return module_str, tokenizer
else:
@@ -111,6 +129,7 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)):
args.device,
args.iree_target_triple,
args.vulkan_max_allocation,
args.upload_ir,
)
safe_name = args.hf_model_name.split("/")[-1].strip()
safe_name = re.sub("-", "_", safe_name)
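Since the new flag is declared with argparse.BooleanOptionalAction (available since Python 3.9), each of these scripts accepts both --upload_ir and --no-upload_ir. A plausible invocation, assuming the script is run directly from the repo root: python models/turbine_models/custom_models/sd_inference/clip.py --hf_model_name=CompVis/stable-diffusion-v1-4 --upload_ir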
models/turbine_models/custom_models/sd_inference/unet.py (18 additions, 0 deletions)
@@ -18,6 +18,7 @@

import safetensors
import argparse
from turbine_models.turbine_tank import turbine_tank

parser = argparse.ArgumentParser()
parser.add_argument(
@@ -53,6 +54,12 @@
help="Specify vulkan target triple or rocm/cuda target device.",
)
parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296")
parser.add_argument(
"--upload_ir",
action=argparse.BooleanOptionalAction,
default=False,
help="upload IR to turbine tank",
)


class UnetModel(torch.nn.Module):
@@ -90,6 +97,7 @@ def export_unet_model(
device=None,
target_triple=None,
max_alloc=None,
upload_ir=False,
):
mapper = {}
utils.save_external_weights(
@@ -125,6 +133,15 @@ def main(

module_str = str(CompiledModule.get_mlir_module(inst))
safe_name = utils.create_safe_name(hf_model_name, "-unet")
if upload_ir:
with open(f"{safe_name}.mlir", "w+") as f:
f.write(module_str)
model_name_upload = hf_model_name.replace("/", "_")
model_name_upload += "-unet"
turbine_tank.uploadToBlobStorage(
str(os.path.abspath(f"{safe_name}.mlir")),
f"{model_name_upload}/{model_name_upload}.mlir",
)
if compile_to != "vmfb":
return module_str
else:
@@ -150,6 +167,7 @@ def main(
args.device,
args.iree_target_triple,
args.vulkan_max_allocation,
args.upload_ir,
)
safe_name = utils.create_safe_name(args.hf_model_name, "-unet")
with open(f"{safe_name}.mlir", "w+") as f:
models/turbine_models/custom_models/sd_inference/vae.py (18 additions, 0 deletions)
@@ -18,6 +18,7 @@

import safetensors
import argparse
from turbine_models.turbine_tank import turbine_tank

parser = argparse.ArgumentParser()
parser.add_argument(
@@ -54,6 +55,12 @@
)
parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296")
parser.add_argument("--variant", type=str, default="decode")
parser.add_argument(
"--upload_ir",
action=argparse.BooleanOptionalAction,
default=False,
help="upload IR to turbine tank",
)


class VaeModel(torch.nn.Module):
@@ -89,6 +96,7 @@ def export_vae_model(
target_triple=None,
max_alloc=None,
variant="decode",
upload_ir=False,
):
mapper = {}
utils.save_external_weights(
@@ -113,6 +121,15 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)):

module_str = str(CompiledModule.get_mlir_module(inst))
safe_name = utils.create_safe_name(hf_model_name, "-vae")
if upload_ir:
with open(f"{safe_name}.mlir", "w+") as f:
f.write(module_str)
model_name_upload = hf_model_name.replace("/", "_")
model_name_upload = model_name_upload + "-vae-" + variant
Review comment (Member): nit: use underscores instead of dashes for consistency here and elsewhere

Reply (Contributor, author): the underscore is only used to separate the org and model name. The rest is all '-' (CompVis_stable-diffusion-v1-4-vae-decode).

turbine_tank.uploadToBlobStorage(
str(os.path.abspath(f"{safe_name}.mlir")),
f"{model_name_upload}/{model_name_upload}.mlir",
)
if compile_to != "vmfb":
return module_str
else:
@@ -139,6 +156,7 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)):
args.iree_target_triple,
args.vulkan_max_allocation,
args.variant,
args.upload_ir,
)
safe_name = utils.create_safe_name(args.hf_model_name, "-vae")
with open(f"{safe_name}.mlir", "w+") as f:
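To make the naming convention from the review thread above concrete, here is the blob path the vae upload block computes, worked through by hand for the model id the author cites (this snippet is illustrative, not part of the PR):

```python
hf_model_name = "CompVis/stable-diffusion-v1-4"
variant = "decode"

# The underscore separates org and model name; dashes are used everywhere else.
model_name_upload = hf_model_name.replace("/", "_") + "-vae-" + variant
assert model_name_upload == "CompVis_stable-diffusion-v1-4-vae-decode"

# Final blob path passed to turbine_tank.uploadToBlobStorage:
blob_path = f"{model_name_upload}/{model_name_upload}.mlir"
```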
models/turbine_models/custom_models/stateless_llama.py (24 additions, 5 deletions)
@@ -2,6 +2,7 @@
import sys
import re
import json
from turbine_models.turbine_tank import turbine_tank

os.environ["TORCH_LOGS"] = "dynamic"
from transformers import AutoTokenizer, AutoModelForCausalLM
@@ -61,6 +62,12 @@
action="store_true",
help="Compile LLM with StreamingLLM optimizations",
)
parser.add_argument(
"--upload_ir",
action=argparse.BooleanOptionalAction,
default=False,
help="upload IR to turbine tank",
)


def generate_schema(num_layers):
@@ -107,7 +114,14 @@ def export_transformer_model(
vulkan_max_allocation=None,
streaming_llm=False,
vmfb_path=None,
upload_ir=False,
):
tokenizer = AutoTokenizer.from_pretrained(
hf_model_name,
use_fast=False,
token=hf_auth_token,
)

mod = AutoModelForCausalLM.from_pretrained(
hf_model_name,
torch_dtype=torch.float,
@@ -121,11 +135,7 @@
if precision == "f16":
mod = mod.half()
dtype = torch.float16
tokenizer = AutoTokenizer.from_pretrained(
hf_model_name,
use_fast=False,
token=hf_auth_token,
)

# TODO: generate these values instead of magic numbers
NUM_LAYERS = mod.config.num_hidden_layers
HEADS = getattr(mod.config, "num_key_value_heads", None)
@@ -319,6 +329,14 @@ def evict_kvcache_space(self):
module_str = str(CompiledModule.get_mlir_module(inst))
safe_name = hf_model_name.split("/")[-1].strip()
safe_name = re.sub("-", "_", safe_name)
if upload_ir:
with open(f"{safe_name}.mlir", "w+") as f:
f.write(module_str)
model_name_upload = hf_model_name.replace("/", "_")
turbine_tank.uploadToBlobStorage(
str(os.path.abspath(f"{safe_name}.mlir")),
f"{model_name_upload}/{model_name_upload}.mlir",
)
if compile_to != "vmfb":
return module_str, tokenizer
else:
@@ -395,6 +413,7 @@ def evict_kvcache_space(self):
args.vulkan_max_allocation,
args.streaming_llm,
args.vmfb_path,
args.upload_ir,
)
safe_name = args.hf_model_name.split("/")[-1].strip()
safe_name = re.sub("-", "_", safe_name)
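For reference, a caller opting into the upload would pass the new flag through export_transformer_model. A minimal sketch using only keyword arguments visible in this diff and in stateless_llama_test.py below; the import path is assumed from the repo layout, the compile_to value is an assumption (the code only checks that it is not "vmfb"), and other parameters keep their defaults:

```python
from turbine_models.custom_models import stateless_llama as llama

# Export the model to MLIR and push the IR to Turbine Tank.
# Any compile_to other than "vmfb" returns (module_str, tokenizer).
module_str, tokenizer = llama.export_transformer_model(
    hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2",
    hf_auth_token=None,
    compile_to="linalg",
    precision="f16",
    device="llvm-cpu",
    target_triple="host",
    upload_ir=True,
)
```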
models/turbine_models/model_builder.py (31 additions, 3 deletions)
@@ -1,6 +1,9 @@
from transformers import AutoModel, AutoTokenizer, AutoConfig
import torch
import shark_turbine.aot as aot
from turbine_models.turbine_tank import turbine_tank
import os
import re


class HFTransformerBuilder:
@@ -23,16 +26,24 @@ def __init__(
auto_tokenizer: AutoTokenizer = None,
auto_config: AutoConfig = None,
hf_auth_token=None,
upload_ir=False,
model=None,
model_type: str = None,
run_e2e: bool = None,
) -> None:
self.example_input = example_input
self.hf_id = hf_id
self.auto_model = auto_model
self.auto_tokenizer = auto_tokenizer
self.auto_config = auto_config
self.hf_auth_token = hf_auth_token
self.model = None
self.model = model
self.tokenizer = None
self.build_model()
self.upload_ir = upload_ir
self.model_type = model_type
self.run_e2e = run_e2e
if self.model is None:
self.build_model()

def build_model(self) -> None:
"""
@@ -59,6 +70,23 @@ def get_compiled_module(self, save_to: str = None) -> aot.CompiledModule:
Returns:
aot.CompiledModule: The compiled module binary.
"""
module = aot.export(self.model, self.example_input)
if self.model_type == "hf_seq2seq":
module = aot.export(self.model, *self.example_input)
else:
module = aot.export(self.model, self.example_input)
module_str = str(module.mlir_module)
safe_name = self.hf_id.split("/")[-1].strip()
safe_name = re.sub("-", "_", safe_name)
if self.upload_ir:
with open(f"{safe_name}.mlir", "w+") as f:
f.write(module_str)
model_name_upload = self.hf_id.replace("/", "_")
turbine_tank.uploadToBlobStorage(
str(os.path.abspath(f"{safe_name}.mlir")),
f"{model_name_upload}/{model_name_upload}.mlir",
)
os.remove(f"{safe_name}.mlir")
if self.run_e2e is not None and self.run_e2e is False:
return
compiled_binary = module.compile(save_to=save_to)
return compiled_binary
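A hypothetical use of the extended builder, based only on the constructor arguments and control flow shown above; the model id and input shape are placeholders:

```python
import torch
from turbine_models.model_builder import HFTransformerBuilder

builder = HFTransformerBuilder(
    example_input=torch.ones(1, 128, dtype=torch.int64),  # placeholder token ids
    hf_id="bert-base-uncased",  # placeholder model id
    upload_ir=True,   # write <safe_name>.mlir, upload it, then delete the local copy
    run_e2e=False,    # get_compiled_module() returns before compiling a binary
)
builder.get_compiled_module()
```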
models/turbine_models/tests/sd_test.py (8 additions, 0 deletions)
@@ -55,6 +55,7 @@

class StableDiffusionTest(unittest.TestCase):
def testExportClipModel(self):
upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload")
with self.assertRaises(SystemExit) as cm:
clip.export_clip_model(
# This is a public model, so no auth required
@@ -64,6 +65,7 @@ def testExportClipModel(self):
"safetensors",
"stable_diffusion_v1_4_clip.safetensors",
"cpu",
upload_ir=upload_ir_var == "upload",
)
self.assertEqual(cm.exception.code, None)
arguments["external_weight_path"] = "stable_diffusion_v1_4_clip.safetensors"
@@ -85,6 +87,7 @@ def testExportClipModel(self):
os.remove("stable_diffusion_v1_4_clip.vmfb")

def testExportUnetModel(self):
upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload")
with self.assertRaises(SystemExit) as cm:
unet.export_unet_model(
unet_model,
@@ -98,6 +101,7 @@ def testExportUnetModel(self):
"safetensors",
"stable_diffusion_v1_4_unet.safetensors",
"cpu",
upload_ir=upload_ir_var == "upload",
)
self.assertEqual(cm.exception.code, None)
arguments["external_weight_path"] = "stable_diffusion_v1_4_unet.safetensors"
@@ -135,6 +139,7 @@ def testExportUnetModel(self):
os.remove("stable_diffusion_v1_4_unet.vmfb")

def testExportVaeModelDecode(self):
upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload")
with self.assertRaises(SystemExit) as cm:
vae.export_vae_model(
vae_model,
@@ -149,6 +154,7 @@ def testExportVaeModelDecode(self):
"stable_diffusion_v1_4_vae.safetensors",
"cpu",
variant="decode",
upload_ir=upload_ir_var == "upload",
)
self.assertEqual(cm.exception.code, None)
arguments["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors"
@@ -180,6 +186,7 @@ def testExportVaeModelDecode(self):
os.remove("stable_diffusion_v1_4_vae.vmfb")

def testExportVaeModelEncode(self):
upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload")
with self.assertRaises(SystemExit) as cm:
vae.export_vae_model(
vae_model,
@@ -194,6 +201,7 @@ def testExportVaeModelEncode(self):
"stable_diffusion_v1_4_vae.safetensors",
"cpu",
variant="encode",
upload_ir=upload_ir_var == "upload",
)
self.assertEqual(cm.exception.code, None)
arguments["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors"
models/turbine_models/tests/stateless_llama_test.py (3 additions, 0 deletions)
@@ -53,6 +53,8 @@ def test_vmfb_comparison(self):
For VMFB, quantization can be int4 or None, but right now only using none for compatibility with torch.
"""

upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload")

llama.export_transformer_model(
hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2",
hf_auth_token=None,
@@ -63,6 +65,7 @@
precision=precision,
device="llvm-cpu",
target_triple="host",
upload_ir=upload_ir_var == "upload",
)

torch_str_cache_path = f"models/turbine_models/tests/vmfb_comparison_cached_torch_output_{precision}_{quantization}.txt"
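Both test suites key the upload off the same environment variable, so a single setting flips every exporter call in a run to upload_ir=True. A sketch of the pattern; how CI actually sets the variable is not shown in this PR, and the pytest command in the comment is an assumption:

```python
import os

# e.g. TURBINE_TANK_ACTION=upload pytest models/turbine_models/tests/sd_test.py
upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload")

# Each test then forwards the decision to its exporter:
#   upload_ir=(upload_ir_var == "upload")
```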