Break: Deprecate old ONNX structure
ashvardanian committed Apr 17, 2024
1 parent 38949f3 commit 94ebd6e
Showing 14 changed files with 430 additions and 240 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -14,3 +14,4 @@ node_modules
*.onnx
*.pt
*.safetensors
*.mlpackage
7 changes: 7 additions & 0 deletions .vscode/settings.json
@@ -1,8 +1,10 @@
{
"cSpell.words": [
"arange",
"astype",
"CFURL",
"coreml",
"crossattn",
"cumsum",
"dtype",
"embs",
@@ -25,12 +27,17 @@
"pretrained",
"probs",
"pypi",
"pytest",
"randn",
"rerank",
"reranker",
"reranking",
"sandbeach",
"sess",
"SIMD",
"softmax",
"Tensorrt",
"torchvision",
"transfromers",
"uform",
"unimodal",
4 changes: 2 additions & 2 deletions Package.swift
@@ -29,13 +29,13 @@ let package = Package(
.product(name: "Transformers", package: "swift-transformers")
],
path: "swift",
exclude: ["EmbeddingsTests.swift"]
exclude: ["EncodersTests.swift"]
),
.testTarget(
name: "UFormTests",
dependencies: ["UForm"],
path: "swift",
sources: ["EmbeddingsTests.swift"]
sources: ["EncodersTests.swift"]
),
]
)
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -31,7 +31,8 @@ classifiers = [
dependencies = [
"huggingface_hub>=0.16.4",
"tokenizers>=0.13.3",
"pillow"
"pillow",
"simsimd",
]
description = "Pocket-Sized Multimodal AI for Content Understanding and Generation"
maintainers = [
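For context, the newly added `simsimd` dependency is a SIMD-accelerated similarity library. A hedged sketch of the kind of call it enables (the embeddings here are random placeholders, and `simsimd.cosine` is assumed to return a cosine *distance*, not a similarity):

```python
import numpy as np
import simsimd

# Placeholder embeddings; in UForm these would come from the encoders.
a = np.random.randn(256).astype(np.float32)
b = np.random.randn(256).astype(np.float32)

similarity = 1.0 - simsimd.cosine(a, b)  # cosine distance -> similarity
print(similarity)
```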
201 changes: 142 additions & 59 deletions python/scripts/export_encoders.ipynb
@@ -4,7 +4,13 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Scripts for Exporting PyTorch Models to ONNX and CoreML"
"# Scripts for Exporting PyTorch Models to ONNX and CoreML\n",
"\n",
"Depending on the backend, we prefer different qunatization schemes.\n",
"\n",
"- For ONNX we use `int8` quantization.\n",
"- For PyTorch we use `bfloat16` quantization.\n",
"- For CoreML we use `float32` representation."
]
},
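The precision choices above map to concrete settings in each toolchain. A minimal sketch of the assumed constants (names follow the notebook's later cells; `precision` is the variable the CoreML conversion below expects):

```python
import torch
import coremltools as ct

# Assumed per-backend precision settings, matching the list above:
torch_dtype = torch.bfloat16      # PyTorch checkpoints
precision = ct.precision.FLOAT32  # CoreML mlprogram representation
# ONNX weights are exported in float32, then quantized to 8 bits
# with onnxruntime's quantize_dynamic later in this notebook.
```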
{
@@ -181,12 +187,12 @@
"coreml_model = ct.convert(\n",
" traced_script_module, source=\"pytorch\",\n",
" inputs=[image_input], outputs=[image_features, image_embeddings],\n",
" convert_to='mlprogram', compute_precision=ct.precision)\n",
" convert_to='mlprogram', compute_precision=precision)\n",
"\n",
"coreml_model.author = 'Unum Cloud'\n",
"coreml_model.license = 'Apache 2.0'\n",
"coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
"coreml_model.save(\"../uform-vl-english-small-image.mlpackage\")"
"coreml_model.save(os.path.join(output_directory, \"image_encoder.mlpackage\"))"
]
},
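After saving, the package can be loaded back as a sanity check; a minimal sketch, assuming the `output_directory` variable from the notebook's earlier cells:

```python
import os
import coremltools as ct

# Reload the saved mlprogram package and print its input/output description.
loaded = ct.models.MLModel(os.path.join(output_directory, "image_encoder.mlpackage"))
print(loaded.get_spec().description)
```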
{
@@ -217,7 +223,7 @@
"coreml_model.author = 'Unum Cloud'\n",
"coreml_model.license = 'Apache 2.0'\n",
"coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
"coreml_model.save(\"../uform-vl-english-small-text.mlpackage\")"
"coreml_model.save(os.path.join(output_directory, \"text_encoder.mlpackage\"))"
]
},
{
@@ -260,7 +266,7 @@
"metadata": {},
"outputs": [],
"source": [
"torch.save(model.image_encoder.state_dict(), 'image.pt')"
"torch.save(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.pt\"))"
]
},
{
@@ -269,7 +275,7 @@
"metadata": {},
"outputs": [],
"source": [
"save_file(model.image_encoder.state_dict(), \"image.safetensors\")"
"save_file(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.safetensors\"))"
]
},
{
@@ -288,7 +294,7 @@
"metadata": {},
"outputs": [],
"source": [
"torch.save(model.text_encoder.state_dict(), 'text.pt')"
"torch.save(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.pt\"))"
]
},
{
Expand All @@ -297,7 +303,7 @@
"metadata": {},
"outputs": [],
"source": [
"save_file(model.text_encoder.state_dict(), \"text.safetensors\")"
"save_file(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.safetensors\"))"
]
},
{
Expand All @@ -312,26 +318,6 @@
"image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!huggingface-cli upload unum-cloud/uform2-vl-english-small image.safetensors image.safetensors\n",
"!huggingface-cli upload unum-cloud/uform2-vl-english-small text.safetensors text.safetensors"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!huggingface-cli upload unum-cloud/uform2-vl-english-small image.pt image.pt\n",
"!huggingface-cli upload unum-cloud/uform2-vl-english-small text.pt text.pt"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -354,7 +340,8 @@
"metadata": {},
"outputs": [],
"source": [
"from torch.onnx import export as onnx_export"
"from torch.onnx import export as onnx_export\n",
"import torch"
]
},
{
Expand All @@ -378,7 +365,7 @@
"onnx_export(\n",
" module,\n",
" (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n",
" \"text.onnx\", \n",
" os.path.join(output_directory, \"text_encoder.onnx\"), \n",
" export_params=True,\n",
" opset_version=15,\n",
" do_constant_folding=True,\n",
@@ -391,27 +378,6 @@
" 'embeddings' : {0 : 'batch_size'}})"
]
},
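Before converting further, the exported graph can be validated with ONNX's structural checker; a short sketch, assuming the save path used above:

```python
import os
import onnx

# Raises if the exported graph violates the ONNX spec (bad ops, broken shapes).
onnx.checker.check_model(onnx.load(os.path.join(output_directory, "text_encoder.onnx")))
```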
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import onnx\n",
"from onnxconverter_common import float16\n",
"\n",
"module = onnx.load(\"text.onnx\")\n",
"module_fp16 = float16.convert_float_to_float16(module)\n",
"onnx.save(module_fp16, \"text.onnx\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -433,7 +399,7 @@
"torch.onnx.export(\n",
" module,\n",
" image_data, \n",
" \"image.onnx\", \n",
" os.path.join(output_directory, \"image_encoder.onnx\"), \n",
" export_params=True,\n",
" opset_version=15,\n",
" do_constant_folding=True,\n",
@@ -445,18 +411,131 @@
" 'embeddings' : {0 : 'batch_size'}})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Quantizing to `float16`\n",
"\n",
"Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import onnx\n",
"from onnxconverter_common import float16\n",
"\n",
"module = onnx.load(\"image.onnx\")\n",
"from onnxconverter_common import float16"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
"module = onnx.load(module_path)\n",
"module_fp16 = float16.convert_float_to_float16(module)\n",
"onnx.save(module_fp16, module_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
"module = onnx.load(module_path)\n",
"module_fp16 = float16.convert_float_to_float16(module)\n",
"onnx.save(module_fp16, \"image.onnx\")"
"onnx.save(module_fp16, module_path)"
]
},
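If a downstream runtime rejects `float16` graph inputs, `convert_float_to_float16` accepts a `keep_io_types` flag that leaves the model's inputs and outputs in `float32` and casts only the interior; a sketch of that variant:

```python
module = onnx.load(module_path)
# keep_io_types=True preserves float32 inputs/outputs, converting only
# the internal tensors and weights to float16.
module_fp16 = float16.convert_float_to_float16(module, keep_io_types=True)
onnx.save(module_fp16, module_path)
```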
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Quantizing to `uint8`\n",
"\n",
"We can further quantize the model into `uint8` using ONNX quantization tools.\n",
"The `int8` is default variant, but [some of the operators don't support it](https://github.com/microsoft/onnxruntime/issues/15888)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from onnxruntime.quantization import quantize_dynamic, QuantType"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
"quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
"quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)"
]
},
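A quick way to confirm the quantization paid off is to compare on-disk sizes before and after running the two cells above; a sketch, again assuming `output_directory`:

```python
import os

# Report the on-disk size of each exported ONNX module in MiB.
for name in ("text_encoder.onnx", "image_encoder.onnx"):
    path = os.path.join(output_directory, name)
    print(f"{name}: {os.path.getsize(path) / 2**20:.1f} MiB")
```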
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's check that the runtime can actually load those models."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import onnxruntime as ort\n",
"session_options = ort.SessionOptions()\n",
"session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
"session = ort.InferenceSession(module_path, sess_options=session_options)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
"session = ort.InferenceSession(module_path, sess_options=session_options)"
]
},
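Loading only proves the graph parses; running a dummy batch exercises the kernels too. A sketch, assuming the text encoder's inputs are named `input_ids` and `attention_mask` as in the export cell above (the sequence length of 77 is hypothetical):

```python
import numpy as np

# Hypothetical dummy batch: one all-zeros sequence with a full attention mask.
dummy = {
    "input_ids": np.zeros((1, 77), dtype=np.int64),
    "attention_mask": np.ones((1, 77), dtype=np.int64),
}
outputs = session.run(None, dummy)
print([o.shape for o in outputs])
```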
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Upload to Hugging Face"
]
},
{
@@ -465,8 +544,12 @@
"metadata": {},
"outputs": [],
"source": [
"!huggingface-cli upload unum-cloud/uform2-vl-english-small image.onnx image.onnx\n",
"!huggingface-cli upload unum-cloud/uform2-vl-english-small text.onnx text.onnx"
"!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.onnx image_encoder.onnx\n",
"!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.onnx text_encoder.onnx\n",
"!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.safetensors image_encoder.safetensors\n",
"!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.safetensors text_encoder.safetensors\n",
"!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.pt image_encoder.pt\n",
"!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.pt text_encoder.pt"
]
}
],
@@ -486,7 +569,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
"version": "3.10.11"
}
},
"nbformat": 4,