Improve: Pass tests for small models
ashvardanian committed Apr 20, 2024
1 parent cccfc62 commit b790519
Showing 6 changed files with 156 additions and 75 deletions.
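The notebook diff below replaces the old `uform.get_model('unum-cloud/...')` loading path with encoders and processors built directly from local files under `models/<model_name>/`. A condensed sketch of that new flow, mirroring the cells in this diff; the repo-relative paths and model name are assumptions taken from the notebook:

import os
import torch
from PIL import Image
from uform.torch_encoders import ImageEncoder, TextEncoder
from uform.torch_processors import ImageProcessor, TextProcessor

# Assumed layout: config, tokenizer, and raw Torch weights live next to each other
model_directory = "models/uform3-image-text-english-small"
config_path = os.path.join(model_directory, "config.json")
tokenizer_path = os.path.join(model_directory, "tokenizer.json")
state_dict = torch.load(os.path.join(model_directory, "torch_weight.pt"))

# Encoders and processors are constructed from the local config, tokenizer, and state dict
image_encoder = ImageEncoder.from_pretrained(config_path, state_dict)
text_encoder = TextEncoder.from_pretrained(config_path, state_dict)
text_processor = TextProcessor(config_path, tokenizer_path)
image_processor = ImageProcessor(config_path)

# Processors return dictionaries of tensors, which the encoders consume directly
text_data = text_processor("a small red panda in a zoo")
image_data = image_processor(Image.open("assets/unum.png"))

image_features, image_embedding = image_encoder.forward(image_data, return_features=True)
text_features, text_embedding = text_encoder.forward(text_data, return_features=True)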
7 changes: 6 additions & 1 deletion .vscode/settings.json
@@ -21,7 +21,9 @@
"ndarray",
"numpy",
"ONNX",
"onnxconverter",
"onnxruntime",
"opset",
"packbits",
"preprocess",
"pretrained",
@@ -48,5 +50,8 @@
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter"
},
"python.formatting.provider": "none"
"python.formatting.provider": "none",
"window.autoDetectColorScheme": true,
"workbench.colorTheme": "Default Dark+",
"workbench.preferredDarkColorTheme": "Default Dark+"
}
130 changes: 76 additions & 54 deletions python/scripts/export_encoders.ipynb
@@ -19,7 +19,6 @@
"metadata": {},
"outputs": [],
"source": [
"!pip uninstall -y uform\n",
"!pip install --upgrade \"uform[torch]\" coremltools"
]
},
@@ -30,8 +29,13 @@
"outputs": [],
"source": [
"import os\n",
"model_name = \"uform-vl-english-small\"\n",
"output_directory = \"../../\""
"\n",
"working_directory = \"../..\"\n",
"model_name = \"uform3-image-text-english-small\"\n",
"model_directory = os.path.join(working_directory, \"models\", model_name)\n",
"model_weights_path = os.path.join(model_directory, \"torch_weight.pt\")\n",
"config_path = os.path.join(model_directory, \"config.json\")\n",
"tokenizer_path = os.path.join(model_directory, \"tokenizer.json\")"
]
},
{
@@ -40,20 +44,20 @@
"metadata": {},
"outputs": [],
"source": [
"import uform\n",
"from PIL import Image\n",
"\n",
"model, processor = uform.get_model('unum-cloud/' + model_name)\n",
"text = 'a small red panda in a zoo'\n",
"image = Image.open('../../assets/unum.png')\n",
"\n",
"image_data = processor.preprocess_image(image)\n",
"text_data = processor.preprocess_text(text)\n",
"\n",
"image_features, image_embedding = model.encode_image(image_data, return_features=True)\n",
"text_features, text_embedding = model.encode_text(text_data, return_features=True)\n",
"import torch\n",
"\n",
"image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
"state_dict = torch.load(model_weights_path)\n",
"list(state_dict.keys())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from uform.torch_encoders import ImageEncoder, TextEncoder\n",
"from uform.torch_processors import ImageProcessor, TextProcessor"
]
},
{
@@ -62,7 +66,9 @@
"metadata": {},
"outputs": [],
"source": [
"model.text_encoder"
"image_encoder = ImageEncoder.from_pretrained(config_path, state_dict)\n",
"text_encoder = TextEncoder.from_pretrained(config_path, state_dict)\n",
"image_encoder, text_encoder"
]
},
{
@@ -71,7 +77,9 @@
"metadata": {},
"outputs": [],
"source": [
"model.image_encoder"
"text_processor = TextProcessor(config_path, tokenizer_path)\n",
"image_processor = ImageProcessor(config_path)\n",
"text_processor, image_processor"
]
},
{
@@ -80,14 +88,19 @@
"metadata": {},
"outputs": [],
"source": [
"# Assuming `model` is your loaded model with image_encoder and text_encoder attributes\n",
"for name, module in model.image_encoder.named_children():\n",
" print(f\"First layer of image_encoder: {name}\")\n",
" break # We break after the first layer\n",
"import uform\n",
"from PIL import Image\n",
"\n",
"for name, module in model.text_encoder.named_children():\n",
" print(f\"First layer of text_encoder: {name}\")\n",
" break # We break after the first layer"
"text = 'a small red panda in a zoo'\n",
"image = Image.open('../../assets/unum.png')\n",
"\n",
"text_data = text_processor(text)\n",
"image_data = image_processor(image)\n",
"\n",
"image_features, image_embedding = image_encoder.forward(image_data, return_features=True)\n",
"text_features, text_embedding = text_encoder.forward(text_data, return_features=True)\n",
"\n",
"image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
]
},
{
@@ -147,7 +160,7 @@
" input_shape = (ct.RangeDim(lower_bound=1, upper_bound=upper_bound, default=1),) + input_shape[1:]\n",
" return input_shape\n",
"\n",
"generalize_first_dimensions(image_data.shape), generalize_first_dimensions(text_data[\"input_ids\"].shape), generalize_first_dimensions(text_data[\"attention_mask\"].shape)"
"generalize_first_dimensions(image_data[\"images\"].shape), generalize_first_dimensions(text_data[\"input_ids\"].shape), generalize_first_dimensions(text_data[\"attention_mask\"].shape)"
]
},
{
Expand All @@ -156,7 +169,7 @@
"metadata": {},
"outputs": [],
"source": [
"image_input = ct.TensorType(name=\"images\", shape=generalize_first_dimensions(image_data.shape, 1))\n",
"image_input = ct.TensorType(name=\"images\", shape=generalize_first_dimensions(image_data[\"images\"].shape, 1))\n",
"text_input = ct.TensorType(name=\"input_ids\", shape=generalize_first_dimensions(text_data[\"input_ids\"].shape, 1))\n",
"text_attention_input = ct.TensorType(name=\"attention_mask\", shape=generalize_first_dimensions(text_data[\"attention_mask\"].shape, 1))\n",
"text_features = ct.TensorType(name=\"features\")\n",
@@ -171,11 +184,11 @@
"metadata": {},
"outputs": [],
"source": [
"module = model.image_encoder\n",
"module = image_encoder\n",
"module.eval()\n",
"module.return_features = True\n",
"\n",
"traced_script_module = torch.jit.trace(module, example_inputs=image_data)\n",
"traced_script_module = torch.jit.trace(module, example_inputs=image_data[\"images\"])\n",
"traced_script_module"
]
},
@@ -193,7 +206,7 @@
"coreml_model.author = 'Unum Cloud'\n",
"coreml_model.license = 'Apache 2.0'\n",
"coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
"coreml_model.save(os.path.join(output_directory, \"image_encoder.mlpackage\"))"
"coreml_model.save(os.path.join(model_directory, \"image_encoder.mlpackage\"))"
]
},
{
Expand All @@ -202,7 +215,7 @@
"metadata": {},
"outputs": [],
"source": [
"module = model.text_encoder\n",
"module = text_encoder\n",
"module.eval()\n",
"module.return_features = True\n",
"\n",
Expand All @@ -224,7 +237,7 @@
"coreml_model.author = 'Unum Cloud'\n",
"coreml_model.license = 'Apache 2.0'\n",
"coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
"coreml_model.save(os.path.join(output_directory, \"text_encoder.mlpackage\"))"
"coreml_model.save(os.path.join(model_directory, \"text_encoder.mlpackage\"))"
]
},
{
Expand Down Expand Up @@ -257,8 +270,8 @@
"metadata": {},
"outputs": [],
"source": [
"model.image_encoder.eval()\n",
"model.image_encoder.to(dtype=torch.bfloat16)"
"image_encoder.eval()\n",
"image_encoder.to(dtype=torch.bfloat16)"
]
},
{
Expand All @@ -267,7 +280,7 @@
"metadata": {},
"outputs": [],
"source": [
"torch.save(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.pt\"))"
"torch.save(image_encoder.state_dict(), os.path.join(model_directory, \"image_encoder.pt\"))"
]
},
{
Expand All @@ -276,7 +289,7 @@
"metadata": {},
"outputs": [],
"source": [
"save_file(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.safetensors\"))"
"save_file(image_encoder.state_dict(), os.path.join(model_directory, \"image_encoder.safetensors\"))"
]
},
{
Expand All @@ -285,8 +298,8 @@
"metadata": {},
"outputs": [],
"source": [
"model.text_encoder.eval()\n",
"model.text_encoder.to(dtype=torch.bfloat16)"
"text_encoder.eval()\n",
"text_encoder.to(dtype=torch.bfloat16)"
]
},
{
Expand All @@ -295,7 +308,7 @@
"metadata": {},
"outputs": [],
"source": [
"torch.save(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.pt\"))"
"torch.save(text_encoder.state_dict(), os.path.join(model_directory, \"text_encoder.pt\"))"
]
},
{
Expand All @@ -304,7 +317,7 @@
"metadata": {},
"outputs": [],
"source": [
"save_file(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.safetensors\"))"
"save_file(text_encoder.state_dict(), os.path.join(model_directory, \"text_encoder.safetensors\"))"
]
},
{
Expand All @@ -313,8 +326,8 @@
"metadata": {},
"outputs": [],
"source": [
"image_features, image_embedding = model.encode_image(image_data.to(dtype=torch.bfloat16), return_features=True)\n",
"text_features, text_embedding = model.encode_text(text_data, return_features=True)\n",
"image_features, image_embedding = image_encoder.forward(image_data[\"images\"].to(dtype=torch.bfloat16), return_features=True)\n",
"text_features, text_embedding = text_encoder.forward(text_data, return_features=True)\n",
"\n",
"image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
]
@@ -358,15 +371,15 @@
"metadata": {},
"outputs": [],
"source": [
"module = model.text_encoder\n",
"module = text_encoder\n",
"module.eval()\n",
"module.return_features = True\n",
"module.to(dtype=torch.float32)\n",
"\n",
"onnx_export(\n",
" module,\n",
" (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n",
" os.path.join(output_directory, \"text_encoder.onnx\"), \n",
" os.path.join(model_directory, \"text_encoder.onnx\"), \n",
" export_params=True,\n",
" opset_version=15,\n",
" do_constant_folding=True,\n",
@@ -392,15 +405,15 @@
"metadata": {},
"outputs": [],
"source": [
"module = model.image_encoder\n",
"module = image_encoder\n",
"module.eval()\n",
"module.return_features = True\n",
"module.to(dtype=torch.float32)\n",
"\n",
"torch.onnx.export(\n",
" module,\n",
" image_data, \n",
" os.path.join(output_directory, \"image_encoder.onnx\"), \n",
" image_data[\"images\"], \n",
" os.path.join(model_directory, \"image_encoder.onnx\"), \n",
" export_params=True,\n",
" opset_version=15,\n",
" do_constant_folding=True,\n",
@@ -437,7 +450,7 @@
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
"module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n",
"module = onnx.load(module_path)\n",
"module_fp16 = float16.convert_float_to_float16(module)\n",
"onnx.save(module_fp16, module_path)"
@@ -449,7 +462,7 @@
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
"module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n",
"module = onnx.load(module_path)\n",
"module_fp16 = float16.convert_float_to_float16(module)\n",
"onnx.save(module_fp16, module_path)"
@@ -480,7 +493,7 @@
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
"module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n",
"quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)"
]
},
@@ -490,7 +503,7 @@
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
"module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n",
"quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)"
]
},
@@ -512,7 +525,7 @@
"from onnx import helper\n",
"\n",
"# Load the ONNX model\n",
"module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
"module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n",
"module = onnx.load(module_path)\n",
"\n",
"# Get the module's graph\n",
@@ -599,7 +612,7 @@
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
"module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n",
"session = ort.InferenceSession(module_path, sess_options=session_options)"
]
},
Expand All @@ -609,7 +622,7 @@
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
"module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n",
"session = ort.InferenceSession(module_path, sess_options=session_options)"
]
},
@@ -620,6 +633,15 @@
"# Upload to Hugging Face"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../models/uform3-image-text-english-small/ . --exclude=\"torch_weight.pt\""
]
},
{
"cell_type": "code",
"execution_count": null,
