Improve: Pass tests for small models
ashvardanian committed Apr 20, 2024
1 parent cccfc62 commit b790519
Showing 6 changed files with 156 additions and 75 deletions.
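The notebook diff below replaces the old `uform.get_model('unum-cloud/...')` loading path with encoders and processors built directly from local files under `models/<model_name>/`. A condensed sketch of that new flow, mirroring the cells in this diff; the repo-relative paths and model name are assumptions taken from the notebook:

import os
import torch
from PIL import Image
from uform.torch_encoders import ImageEncoder, TextEncoder
from uform.torch_processors import ImageProcessor, TextProcessor

# Assumed layout: config, tokenizer, and raw Torch weights live next to each other
model_directory = "models/uform3-image-text-english-small"
config_path = os.path.join(model_directory, "config.json")
tokenizer_path = os.path.join(model_directory, "tokenizer.json")
state_dict = torch.load(os.path.join(model_directory, "torch_weight.pt"))

# Encoders and processors are constructed from the local config, tokenizer, and state dict
image_encoder = ImageEncoder.from_pretrained(config_path, state_dict)
text_encoder = TextEncoder.from_pretrained(config_path, state_dict)
text_processor = TextProcessor(config_path, tokenizer_path)
image_processor = ImageProcessor(config_path)

# Processors return dictionaries of tensors, which the encoders consume directly
text_data = text_processor("a small red panda in a zoo")
image_data = image_processor(Image.open("assets/unum.png"))

image_features, image_embedding = image_encoder.forward(image_data, return_features=True)
text_features, text_embedding = text_encoder.forward(text_data, return_features=True)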
7 changes: 6 additions & 1 deletion .vscode/settings.json
@@ -21,7 +21,9 @@
"ndarray",
"numpy",
"ONNX",
"onnxconverter",
"onnxruntime",
"opset",
"packbits",
"preprocess",
"pretrained",
@@ -48,5 +50,8 @@
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter"
},
"python.formatting.provider": "none"
"python.formatting.provider": "none",
"window.autoDetectColorScheme": true,
"workbench.colorTheme": "Default Dark+",
"workbench.preferredDarkColorTheme": "Default Dark+"
}
130 changes: 76 additions & 54 deletions python/scripts/export_encoders.ipynb
@@ -19,7 +19,6 @@
"metadata": {},
"outputs": [],
"source": [
"!pip uninstall -y uform\n",
"!pip install --upgrade \"uform[torch]\" coremltools"
]
},
@@ -30,8 +29,13 @@
"outputs": [],
"source": [
"import os\n",
"model_name = \"uform-vl-english-small\"\n",
"output_directory = \"../../\""
"\n",
"working_directory = \"../..\"\n",
"model_name = \"uform3-image-text-english-small\"\n",
"model_directory = os.path.join(working_directory, \"models\", model_name)\n",
"model_weights_path = os.path.join(model_directory, \"torch_weight.pt\")\n",
"config_path = os.path.join(model_directory, \"config.json\")\n",
"tokenizer_path = os.path.join(model_directory, \"tokenizer.json\")"
]
},
{
@@ -40,20 +44,20 @@
"metadata": {},
"outputs": [],
"source": [
"import uform\n",
"from PIL import Image\n",
"\n",
"model, processor = uform.get_model('unum-cloud/' + model_name)\n",
"text = 'a small red panda in a zoo'\n",
"image = Image.open('../../assets/unum.png')\n",
"\n",
"image_data = processor.preprocess_image(image)\n",
"text_data = processor.preprocess_text(text)\n",
"\n",
"image_features, image_embedding = model.encode_image(image_data, return_features=True)\n",
"text_features, text_embedding = model.encode_text(text_data, return_features=True)\n",
"import torch\n",
"\n",
"image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
"state_dict = torch.load(model_weights_path)\n",
"list(state_dict.keys())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from uform.torch_encoders import ImageEncoder, TextEncoder\n",
"from uform.torch_processors import ImageProcessor, TextProcessor"
]
},
{
@@ -62,7 +66,9 @@
"metadata": {},
"outputs": [],
"source": [
"model.text_encoder"
"image_encoder = ImageEncoder.from_pretrained(config_path, state_dict)\n",
"text_encoder = TextEncoder.from_pretrained(config_path, state_dict)\n",
"image_encoder, text_encoder"
]
},
{
@@ -71,7 +77,9 @@
"metadata": {},
"outputs": [],
"source": [
"model.image_encoder"
"text_processor = TextProcessor(config_path, tokenizer_path)\n",
"image_processor = ImageProcessor(config_path)\n",
"text_processor, image_processor"
]
},
{
@@ -80,14 +88,19 @@
"metadata": {},
"outputs": [],
"source": [
"# Assuming `model` is your loaded model with image_encoder and text_encoder attributes\n",
"for name, module in model.image_encoder.named_children():\n",
" print(f\"First layer of image_encoder: {name}\")\n",
" break # We break after the first layer\n",
"import uform\n",
"from PIL import Image\n",
"\n",
"for name, module in model.text_encoder.named_children():\n",
" print(f\"First layer of text_encoder: {name}\")\n",
" break # We break after the first layer"
"text = 'a small red panda in a zoo'\n",
"image = Image.open('../../assets/unum.png')\n",
"\n",
"text_data = text_processor(text)\n",
"image_data = image_processor(image)\n",
"\n",
"image_features, image_embedding = image_encoder.forward(image_data, return_features=True)\n",
"text_features, text_embedding = text_encoder.forward(text_data, return_features=True)\n",
"\n",
"image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
]
},
{
@@ -147,7 +160,7 @@
" input_shape = (ct.RangeDim(lower_bound=1, upper_bound=upper_bound, default=1),) + input_shape[1:]\n",
" return input_shape\n",
"\n",
"generalize_first_dimensions(image_data.shape), generalize_first_dimensions(text_data[\"input_ids\"].shape), generalize_first_dimensions(text_data[\"attention_mask\"].shape)"
"generalize_first_dimensions(image_data[\"images\"].shape), generalize_first_dimensions(text_data[\"input_ids\"].shape), generalize_first_dimensions(text_data[\"attention_mask\"].shape)"
]
},
{
Expand All @@ -156,7 +169,7 @@
"metadata": {},
"outputs": [],
"source": [
"image_input = ct.TensorType(name=\"images\", shape=generalize_first_dimensions(image_data.shape, 1))\n",
"image_input = ct.TensorType(name=\"images\", shape=generalize_first_dimensions(image_data[\"images\"].shape, 1))\n",
"text_input = ct.TensorType(name=\"input_ids\", shape=generalize_first_dimensions(text_data[\"input_ids\"].shape, 1))\n",
"text_attention_input = ct.TensorType(name=\"attention_mask\", shape=generalize_first_dimensions(text_data[\"attention_mask\"].shape, 1))\n",
"text_features = ct.TensorType(name=\"features\")\n",
@@ -171,11 +184,11 @@
"metadata": {},
"outputs": [],
"source": [
"module = model.image_encoder\n",
"module = image_encoder\n",
"module.eval()\n",
"module.return_features = True\n",
"\n",
"traced_script_module = torch.jit.trace(module, example_inputs=image_data)\n",
"traced_script_module = torch.jit.trace(module, example_inputs=image_data[\"images\"])\n",
"traced_script_module"
]
},
@@ -193,7 +206,7 @@
"coreml_model.author = 'Unum Cloud'\n",
"coreml_model.license = 'Apache 2.0'\n",
"coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
"coreml_model.save(os.path.join(output_directory, \"image_encoder.mlpackage\"))"
"coreml_model.save(os.path.join(model_directory, \"image_encoder.mlpackage\"))"
]
},
{
Expand All @@ -202,7 +215,7 @@
"metadata": {},
"outputs": [],
"source": [
"module = model.text_encoder\n",
"module = text_encoder\n",
"module.eval()\n",
"module.return_features = True\n",
"\n",
Expand All @@ -224,7 +237,7 @@
"coreml_model.author = 'Unum Cloud'\n",
"coreml_model.license = 'Apache 2.0'\n",
"coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
"coreml_model.save(os.path.join(output_directory, \"text_encoder.mlpackage\"))"
"coreml_model.save(os.path.join(model_directory, \"text_encoder.mlpackage\"))"
]
},
{
Expand Down Expand Up @@ -257,8 +270,8 @@
"metadata": {},
"outputs": [],
"source": [
"model.image_encoder.eval()\n",
"model.image_encoder.to(dtype=torch.bfloat16)"
"image_encoder.eval()\n",
"image_encoder.to(dtype=torch.bfloat16)"
]
},
{
Expand All @@ -267,7 +280,7 @@
"metadata": {},
"outputs": [],
"source": [
"torch.save(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.pt\"))"
"torch.save(image_encoder.state_dict(), os.path.join(model_directory, \"image_encoder.pt\"))"
]
},
{
Expand All @@ -276,7 +289,7 @@
"metadata": {},
"outputs": [],
"source": [
"save_file(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.safetensors\"))"
"save_file(image_encoder.state_dict(), os.path.join(model_directory, \"image_encoder.safetensors\"))"
]
},
{
Expand All @@ -285,8 +298,8 @@
"metadata": {},
"outputs": [],
"source": [
"model.text_encoder.eval()\n",
"model.text_encoder.to(dtype=torch.bfloat16)"
"text_encoder.eval()\n",
"text_encoder.to(dtype=torch.bfloat16)"
]
},
{
Expand All @@ -295,7 +308,7 @@
"metadata": {},
"outputs": [],
"source": [
"torch.save(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.pt\"))"
"torch.save(text_encoder.state_dict(), os.path.join(model_directory, \"text_encoder.pt\"))"
]
},
{
Expand All @@ -304,7 +317,7 @@
"metadata": {},
"outputs": [],
"source": [
"save_file(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.safetensors\"))"
"save_file(text_encoder.state_dict(), os.path.join(model_directory, \"text_encoder.safetensors\"))"
]
},
{
Expand All @@ -313,8 +326,8 @@
"metadata": {},
"outputs": [],
"source": [
"image_features, image_embedding = model.encode_image(image_data.to(dtype=torch.bfloat16), return_features=True)\n",
"text_features, text_embedding = model.encode_text(text_data, return_features=True)\n",
"image_features, image_embedding = image_encoder.forward(image_data[\"images\"].to(dtype=torch.bfloat16), return_features=True)\n",
"text_features, text_embedding = text_encoder.forward(text_data, return_features=True)\n",
"\n",
"image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
]
@@ -358,15 +371,15 @@
"metadata": {},
"outputs": [],
"source": [
"module = model.text_encoder\n",
"module = text_encoder\n",
"module.eval()\n",
"module.return_features = True\n",
"module.to(dtype=torch.float32)\n",
"\n",
"onnx_export(\n",
" module,\n",
" (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n",
" os.path.join(output_directory, \"text_encoder.onnx\"), \n",
" os.path.join(model_directory, \"text_encoder.onnx\"), \n",
" export_params=True,\n",
" opset_version=15,\n",
" do_constant_folding=True,\n",
@@ -392,15 +405,15 @@
"metadata": {},
"outputs": [],
"source": [
"module = model.image_encoder\n",
"module = image_encoder\n",
"module.eval()\n",
"module.return_features = True\n",
"module.to(dtype=torch.float32)\n",
"\n",
"torch.onnx.export(\n",
" module,\n",
" image_data, \n",
" os.path.join(output_directory, \"image_encoder.onnx\"), \n",
" image_data[\"images\"], \n",
" os.path.join(model_directory, \"image_encoder.onnx\"), \n",
" export_params=True,\n",
" opset_version=15,\n",
" do_constant_folding=True,\n",
@@ -437,7 +450,7 @@
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
"module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n",
"module = onnx.load(module_path)\n",
"module_fp16 = float16.convert_float_to_float16(module)\n",
"onnx.save(module_fp16, module_path)"
@@ -449,7 +462,7 @@
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
"module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n",
"module = onnx.load(module_path)\n",
"module_fp16 = float16.convert_float_to_float16(module)\n",
"onnx.save(module_fp16, module_path)"
@@ -480,7 +493,7 @@
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
"module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n",
"quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)"
]
},
@@ -490,7 +503,7 @@
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
"module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n",
"quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)"
]
},
@@ -512,7 +525,7 @@
"from onnx import helper\n",
"\n",
"# Load the ONNX model\n",
"module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
"module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n",
"module = onnx.load(module_path)\n",
"\n",
"# Get the module's graph\n",
@@ -599,7 +612,7 @@
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
"module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n",
"session = ort.InferenceSession(module_path, sess_options=session_options)"
]
},
Expand All @@ -609,7 +622,7 @@
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
"module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n",
"session = ort.InferenceSession(module_path, sess_options=session_options)"
]
},
@@ -620,6 +633,15 @@
"# Upload to Hugging Face"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../models/uform3-image-text-english-small/ . --exclude=\"torch_weight.pt\""
]
},
{
"cell_type": "code",
"execution_count": null,
