Turbine Tank Turbine Changes #437

Merged: 24 commits, Feb 29, 2024
Changes shown from 6 commits
models/requirements.txt (4 additions, 0 deletions)
@@ -5,3 +5,7 @@ transformers
accelerate
diffusers==0.24.0
brevitas @ git+https://github.com/Xilinx/brevitas.git@6695e8df7f6a2c7715b9ed69c4b78157376bb60b
# turbine tank downloading/uploading
azure-storage-blob
# microsoft/phi model
einops
models/setup.py (2 additions, 0 deletions)
@@ -61,5 +61,7 @@ def load_version_info():
"transformers",
"accelerate",
"diffusers==0.24.0",
"azure-storage-blob",
"einops",
],
)
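The turbine_tank module that these diffs import is added elsewhere in the PR and is not shown in this view. As a rough sketch only, an uploadToBlobStorage(file_path, blob_name) helper built on the new azure-storage-blob dependency could look like the following; the container name and the connection-string environment variable are assumptions, not taken from this PR:

```python
import os
from azure.storage.blob import BlobServiceClient

def uploadToBlobStorage(file_path: str, blob_name: str):
    # Credentials from the environment; the variable name is an assumption.
    connection_string = os.environ["AZURE_STORAGE_CONNECTION_STRING"]
    service = BlobServiceClient.from_connection_string(connection_string)
    # Container name "turbine-tank" is hypothetical.
    blob_client = service.get_blob_client(container="turbine-tank", blob=blob_name)
    with open(file_path, "rb") as data:
        blob_client.upload_blob(data, overwrite=True)
    print(f"Uploaded {file_path} as {blob_name}")
```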
models/turbine_models/custom_models/sd_inference/clip.py (19 additions, 0 deletions)
@@ -16,6 +16,7 @@
import torch
import torch._dynamo as dynamo
from transformers import CLIPTextModel, CLIPTokenizer
from turbine_models.turbine_tank import turbine_tank

import argparse

@@ -46,6 +47,12 @@
help="Specify vulkan target triple or rocm/cuda target device.",
)
parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296")
parser.add_argument(
"--upload_ir",
action=argparse.BooleanOptionalAction,
default=False,
help="upload IR to turbine tank",
)


def export_clip_model(
@@ -57,13 +64,15 @@ def export_clip_model(
device=None,
target_triple=None,
max_alloc=None,
upload_ir=False,
):
# Load the tokenizer and text encoder to tokenize and encode the text.
tokenizer = CLIPTokenizer.from_pretrained(
hf_model_name,
subfolder="tokenizer",
token=hf_auth_token,
)

text_encoder_model = CLIPTextModel.from_pretrained(
hf_model_name,
subfolder="text_encoder",
@@ -94,6 +103,15 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)):

module_str = str(CompiledModule.get_mlir_module(inst))
safe_name = utils.create_safe_name(hf_model_name, "-clip")
if upload_ir:
with open(f"{safe_name}.mlir", "w+") as f:
f.write(module_str)
model_name_upload = hf_model_name.replace("/", "_")
model_name_upload += "-clip"
turbine_tank.uploadToBlobStorage(
str(os.path.abspath(f"{safe_name}.mlir")),
f"{model_name_upload}/{model_name_upload}.mlir",
)
if compile_to != "vmfb":
return module_str, tokenizer
else:
@@ -111,6 +129,7 @@ def main(self, inp=AbstractTensor(1, 77, dtype=torch.int64)):
args.device,
args.iree_target_triple,
args.vulkan_max_allocation,
args.upload_ir,
)
safe_name = args.hf_model_name.split("/")[-1].strip()
safe_name = re.sub("-", "_", safe_name)
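Since the new flag is declared with argparse.BooleanOptionalAction (available since Python 3.9), each of these scripts accepts both --upload_ir and --no-upload_ir. A plausible invocation, assuming the script is run directly from the repo root: python models/turbine_models/custom_models/sd_inference/clip.py --hf_model_name=CompVis/stable-diffusion-v1-4 --upload_ir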
models/turbine_models/custom_models/sd_inference/unet.py (18 additions, 0 deletions)
@@ -18,6 +18,7 @@

import safetensors
import argparse
from turbine_models.turbine_tank import turbine_tank

parser = argparse.ArgumentParser()
parser.add_argument(
@@ -53,6 +54,12 @@
help="Specify vulkan target triple or rocm/cuda target device.",
)
parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296")
parser.add_argument(
"--upload_ir",
action=argparse.BooleanOptionalAction,
default=False,
help="upload IR to turbine tank",
)


class UnetModel(torch.nn.Module):
@@ -90,6 +97,7 @@ def export_unet_model(
device=None,
target_triple=None,
max_alloc=None,
upload_ir=False,
):
mapper = {}
utils.save_external_weights(
@@ -125,6 +133,15 @@ def main(

module_str = str(CompiledModule.get_mlir_module(inst))
safe_name = utils.create_safe_name(hf_model_name, "-unet")
if upload_ir:
with open(f"{safe_name}.mlir", "w+") as f:
f.write(module_str)
model_name_upload = hf_model_name.replace("/", "_")
model_name_upload += "-unet"
turbine_tank.uploadToBlobStorage(
str(os.path.abspath(f"{safe_name}.mlir")),
f"{model_name_upload}/{model_name_upload}.mlir",
)
if compile_to != "vmfb":
return module_str
else:
@@ -150,6 +167,7 @@ def main(
args.device,
args.iree_target_triple,
args.vulkan_max_allocation,
args.upload_ir,
)
safe_name = utils.create_safe_name(args.hf_model_name, "-unet")
with open(f"{safe_name}.mlir", "w+") as f:
models/turbine_models/custom_models/sd_inference/vae.py (18 additions, 0 deletions)
@@ -18,6 +18,7 @@

import safetensors
import argparse
from turbine_models.turbine_tank import turbine_tank

parser = argparse.ArgumentParser()
parser.add_argument(
@@ -54,6 +55,12 @@
)
parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296")
parser.add_argument("--variant", type=str, default="decode")
parser.add_argument(
"--upload_ir",
action=argparse.BooleanOptionalAction,
default=False,
help="upload IR to turbine tank",
)


class VaeModel(torch.nn.Module):
@@ -89,6 +96,7 @@ def export_vae_model(
target_triple=None,
max_alloc=None,
variant="decode",
upload_ir=False,
):
mapper = {}
utils.save_external_weights(
@@ -113,6 +121,15 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)):

module_str = str(CompiledModule.get_mlir_module(inst))
safe_name = utils.create_safe_name(hf_model_name, "-vae")
if upload_ir:
with open(f"{safe_name}.mlir", "w+") as f:
f.write(module_str)
model_name_upload = hf_model_name.replace("/", "_")
model_name_upload = model_name_upload + "-vae-" + variant
Review comment (Member): nit: use underscores instead of dashes for consistency here and elsewhere

Reply (Contributor, author): the underscore is only used to separate the org and model name. The rest is all '-' (CompVis_stable-diffusion-v1-4-vae-decode).

turbine_tank.uploadToBlobStorage(
str(os.path.abspath(f"{safe_name}.mlir")),
f"{model_name_upload}/{model_name_upload}.mlir",
)
if compile_to != "vmfb":
return module_str
else:
@@ -139,6 +156,7 @@ def main(self, inp=AbstractTensor(*sample, dtype=torch.float32)):
args.iree_target_triple,
args.vulkan_max_allocation,
args.variant,
args.upload_ir,
)
safe_name = utils.create_safe_name(args.hf_model_name, "-vae")
with open(f"{safe_name}.mlir", "w+") as f:
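To make the naming convention from the review thread above concrete, here is the blob path the vae upload block computes, worked through by hand for the model id the author cites (this snippet is illustrative, not part of the PR):

```python
hf_model_name = "CompVis/stable-diffusion-v1-4"
variant = "decode"

# The underscore separates org and model name; dashes are used everywhere else.
model_name_upload = hf_model_name.replace("/", "_") + "-vae-" + variant
assert model_name_upload == "CompVis_stable-diffusion-v1-4-vae-decode"

# Final blob path passed to turbine_tank.uploadToBlobStorage:
blob_path = f"{model_name_upload}/{model_name_upload}.mlir"
```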
models/turbine_models/custom_models/stateless_llama.py (24 additions, 5 deletions)
@@ -2,6 +2,7 @@
import sys
import re
import json
from turbine_models.turbine_tank import turbine_tank

os.environ["TORCH_LOGS"] = "dynamic"
from transformers import AutoTokenizer, AutoModelForCausalLM
@@ -61,6 +62,12 @@
action="store_true",
help="Compile LLM with StreamingLLM optimizations",
)
parser.add_argument(
"--upload_ir",
action=argparse.BooleanOptionalAction,
default=False,
help="upload IR to turbine tank",
)


def generate_schema(num_layers):
@@ -107,7 +114,14 @@ def export_transformer_model(
vulkan_max_allocation=None,
streaming_llm=False,
vmfb_path=None,
upload_ir=False,
):
tokenizer = AutoTokenizer.from_pretrained(
hf_model_name,
use_fast=False,
token=hf_auth_token,
)

mod = AutoModelForCausalLM.from_pretrained(
hf_model_name,
torch_dtype=torch.float,
@@ -121,11 +135,7 @@
if precision == "f16":
mod = mod.half()
dtype = torch.float16
tokenizer = AutoTokenizer.from_pretrained(
hf_model_name,
use_fast=False,
token=hf_auth_token,
)

# TODO: generate these values instead of magic numbers
NUM_LAYERS = mod.config.num_hidden_layers
HEADS = getattr(mod.config, "num_key_value_heads", None)
@@ -319,6 +329,14 @@ def evict_kvcache_space(self):
module_str = str(CompiledModule.get_mlir_module(inst))
safe_name = hf_model_name.split("/")[-1].strip()
safe_name = re.sub("-", "_", safe_name)
if upload_ir:
with open(f"{safe_name}.mlir", "w+") as f:
f.write(module_str)
model_name_upload = hf_model_name.replace("/", "_")
turbine_tank.uploadToBlobStorage(
str(os.path.abspath(f"{safe_name}.mlir")),
f"{model_name_upload}/{model_name_upload}.mlir",
)
if compile_to != "vmfb":
return module_str, tokenizer
else:
@@ -395,6 +413,7 @@ def evict_kvcache_space(self):
args.vulkan_max_allocation,
args.streaming_llm,
args.vmfb_path,
args.upload_ir,
)
safe_name = args.hf_model_name.split("/")[-1].strip()
safe_name = re.sub("-", "_", safe_name)
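For reference, a caller opting into the upload would pass the new flag through export_transformer_model. A minimal sketch using only keyword arguments visible in this diff and in stateless_llama_test.py below; the import path is assumed from the repo layout, the compile_to value is an assumption (the code only checks that it is not "vmfb"), and other parameters keep their defaults:

```python
from turbine_models.custom_models import stateless_llama as llama

# Export the model to MLIR and push the IR to Turbine Tank.
# Any compile_to other than "vmfb" returns (module_str, tokenizer).
module_str, tokenizer = llama.export_transformer_model(
    hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2",
    hf_auth_token=None,
    compile_to="linalg",
    precision="f16",
    device="llvm-cpu",
    target_triple="host",
    upload_ir=True,
)
```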
models/turbine_models/model_builder.py (31 additions, 3 deletions)
@@ -1,6 +1,9 @@
from transformers import AutoModel, AutoTokenizer, AutoConfig
import torch
import shark_turbine.aot as aot
from turbine_models.turbine_tank import turbine_tank
import os
import re


class HFTransformerBuilder:
@@ -23,16 +26,24 @@ def __init__(
auto_tokenizer: AutoTokenizer = None,
auto_config: AutoConfig = None,
hf_auth_token=None,
upload_ir=False,
model=None,
model_type: str = None,
run_e2e: bool = None,
) -> None:
self.example_input = example_input
self.hf_id = hf_id
self.auto_model = auto_model
self.auto_tokenizer = auto_tokenizer
self.auto_config = auto_config
self.hf_auth_token = hf_auth_token
self.model = None
self.model = model
self.tokenizer = None
self.build_model()
self.upload_ir = upload_ir
self.model_type = model_type
self.run_e2e = run_e2e
if self.model is None:
self.build_model()

def build_model(self) -> None:
"""
@@ -59,6 +70,23 @@ def get_compiled_module(self, save_to: str = None) -> aot.CompiledModule:
Returns:
aot.CompiledModule: The compiled module binary.
"""
module = aot.export(self.model, self.example_input)
if self.model_type == "hf_seq2seq":
module = aot.export(self.model, *self.example_input)
else:
module = aot.export(self.model, self.example_input)
module_str = str(module.mlir_module)
safe_name = self.hf_id.split("/")[-1].strip()
safe_name = re.sub("-", "_", safe_name)
if self.upload_ir:
with open(f"{safe_name}.mlir", "w+") as f:
f.write(module_str)
model_name_upload = self.hf_id.replace("/", "_")
turbine_tank.uploadToBlobStorage(
str(os.path.abspath(f"{safe_name}.mlir")),
f"{model_name_upload}/{model_name_upload}.mlir",
)
os.remove(f"{safe_name}.mlir")
if self.run_e2e is not None and self.run_e2e is False:
return
compiled_binary = module.compile(save_to=save_to)
return compiled_binary
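A hypothetical use of the extended builder, based only on the constructor arguments and control flow shown above; the model id and input shape are placeholders:

```python
import torch
from turbine_models.model_builder import HFTransformerBuilder

builder = HFTransformerBuilder(
    example_input=torch.ones(1, 128, dtype=torch.int64),  # placeholder token ids
    hf_id="bert-base-uncased",  # placeholder model id
    upload_ir=True,   # write <safe_name>.mlir, upload it, then delete the local copy
    run_e2e=False,    # get_compiled_module() returns before compiling a binary
)
builder.get_compiled_module()
```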
models/turbine_models/tests/sd_test.py (8 additions, 0 deletions)
@@ -55,6 +55,7 @@

class StableDiffusionTest(unittest.TestCase):
def testExportClipModel(self):
upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload")
with self.assertRaises(SystemExit) as cm:
clip.export_clip_model(
# This is a public model, so no auth required
@@ -64,6 +65,7 @@ def testExportClipModel(self):
"safetensors",
"stable_diffusion_v1_4_clip.safetensors",
"cpu",
upload_ir=upload_ir_var == "upload",
)
self.assertEqual(cm.exception.code, None)
arguments["external_weight_path"] = "stable_diffusion_v1_4_clip.safetensors"
@@ -85,6 +87,7 @@ def testExportClipModel(self):
os.remove("stable_diffusion_v1_4_clip.vmfb")

def testExportUnetModel(self):
upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload")
with self.assertRaises(SystemExit) as cm:
unet.export_unet_model(
unet_model,
@@ -98,6 +101,7 @@ def testExportUnetModel(self):
"safetensors",
"stable_diffusion_v1_4_unet.safetensors",
"cpu",
upload_ir=upload_ir_var == "upload",
)
self.assertEqual(cm.exception.code, None)
arguments["external_weight_path"] = "stable_diffusion_v1_4_unet.safetensors"
@@ -135,6 +139,7 @@ def testExportUnetModel(self):
os.remove("stable_diffusion_v1_4_unet.vmfb")

def testExportVaeModelDecode(self):
upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload")
with self.assertRaises(SystemExit) as cm:
vae.export_vae_model(
vae_model,
@@ -149,6 +154,7 @@ def testExportVaeModelDecode(self):
"stable_diffusion_v1_4_vae.safetensors",
"cpu",
variant="decode",
upload_ir=upload_ir_var == "upload",
)
self.assertEqual(cm.exception.code, None)
arguments["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors"
@@ -180,6 +186,7 @@ def testExportVaeModelDecode(self):
os.remove("stable_diffusion_v1_4_vae.vmfb")

def testExportVaeModelEncode(self):
upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload")
with self.assertRaises(SystemExit) as cm:
vae.export_vae_model(
vae_model,
@@ -194,6 +201,7 @@ def testExportVaeModelEncode(self):
"stable_diffusion_v1_4_vae.safetensors",
"cpu",
variant="encode",
upload_ir=upload_ir_var == "upload",
)
self.assertEqual(cm.exception.code, None)
arguments["external_weight_path"] = "stable_diffusion_v1_4_vae.safetensors"
models/turbine_models/tests/stateless_llama_test.py (3 additions, 0 deletions)
@@ -53,6 +53,8 @@ def test_vmfb_comparison(self):
For VMFB, quantization can be int4 or None, but right now only using none for compatibility with torch.
"""

upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload")

llama.export_transformer_model(
hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2",
hf_auth_token=None,
@@ -63,6 +65,7 @@
precision=precision,
device="llvm-cpu",
target_triple="host",
upload_ir=upload_ir_var == "upload",
)

torch_str_cache_path = f"models/turbine_models/tests/vmfb_comparison_cached_torch_output_{precision}_{quantization}.txt"
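Both test suites key the upload off the same environment variable, so a single setting flips every exporter call in a run to upload_ir=True. A sketch of the pattern; how CI actually sets the variable is not shown in this PR, and the pytest command in the comment is an assumption:

```python
import os

# e.g. TURBINE_TANK_ACTION=upload pytest models/turbine_models/tests/sd_test.py
upload_ir_var = os.environ.get("TURBINE_TANK_ACTION", "not_upload")

# Each test then forwards the decision to its exporter:
#   upload_ir=(upload_ir_var == "upload")
```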