diff --git a/.gitignore b/.gitignore index 4db8e17..f4fa33b 100755 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ node_modules *.onnx *.pt *.safetensors +*.mlpackage diff --git a/.vscode/settings.json b/.vscode/settings.json index a6cceb8..5052dea 100755 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,8 +1,10 @@ { "cSpell.words": [ "arange", + "astype", "CFURL", "coreml", + "crossattn", "cumsum", "dtype", "embs", @@ -25,12 +27,17 @@ "pretrained", "probs", "pypi", + "pytest", + "randn", "rerank", "reranker", "reranking", + "sandbeach", "sess", "SIMD", "softmax", + "Tensorrt", + "torchvision", "transfromers", "uform", "unimodal", diff --git a/Package.swift b/Package.swift index 6ac8372..b3b9ffd 100644 --- a/Package.swift +++ b/Package.swift @@ -29,13 +29,13 @@ let package = Package( .product(name: "Transformers", package: "swift-transformers") ], path: "swift", - exclude: ["EmbeddingsTests.swift"] + exclude: ["EncodersTests.swift"] ), .testTarget( name: "UFormTests", dependencies: ["UForm"], path: "swift", - sources: ["EmbeddingsTests.swift"] + sources: ["EncodersTests.swift"] ), ] ) diff --git a/pyproject.toml b/pyproject.toml index 10f7a9b..1a84808 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,8 @@ classifiers = [ dependencies = [ "huggingface_hub>=0.16.4", "tokenizers>=0.13.3", - "pillow" + "pillow", + "simsimd", ] description = "Pocket-Sized Multimodal AI for Content Understanding and Generation" maintainers = [ diff --git a/python/scripts/export_encoders.ipynb b/python/scripts/export_encoders.ipynb index df57858..0ec6e8a 100644 --- a/python/scripts/export_encoders.ipynb +++ b/python/scripts/export_encoders.ipynb @@ -4,7 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Scripts for Exporting PyTorch Models to ONNX and CoreML" + "# Scripts for Exporting PyTorch Models to ONNX and CoreML\n", + "\n", + "Depending on the backend, we prefer different quantization schemes.\n", + "\n", + "- For ONNX we use `int8` quantization.\n", + "- For PyTorch we use `bfloat16` quantization.\n", + "- For CoreML we use `float32` representation."
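As a rough sketch of those three per-backend precision choices (not the notebook's actual export code — the toy `torch.nn.Linear` model and the `model.onnx` / `model_int8.onnx` / `model.pt` file names are placeholders), the handling might look like this:

```python
import torch
import coremltools as ct
from onnxruntime.quantization import QuantType, quantize_dynamic

# Toy stand-in for an exported encoder; the real notebook exports the UForm text/image encoders.
model = torch.nn.Linear(4, 4).eval()
dummy = torch.randn(1, 4)

# ONNX backend: export in float32, then apply post-training dynamic int8 quantization.
torch.onnx.export(model, dummy, "model.onnx")
quantize_dynamic("model.onnx", "model_int8.onnx", weight_type=QuantType.QInt8)

# CoreML backend: convert a traced graph, keeping the full float32 representation.
traced = torch.jit.trace(model, dummy)
coreml_model = ct.convert(
    traced,
    inputs=[ct.TensorType(shape=dummy.shape)],
    compute_precision=ct.precision.FLOAT32,
    convert_to="mlprogram",
)

# PyTorch backend: store the checkpoint itself in bfloat16 (done last, since .to() casts in place).
torch.save(model.to(dtype=torch.bfloat16).state_dict(), "model.pt")
```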
] }, { @@ -18,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -29,9 +35,47 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: dlopen(/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so, 0x0006): Symbol not found: __ZN3c106detail19maybe_wrap_dim_slowExxb\n", + " Referenced from: <0B637046-A38B-3A5C-80C6-E847C27DCCD5> /Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so\n", + " Expected in: <3AE92490-D363-3FD7-8532-CB6F5F795BC8> /Users/av/miniconda3/lib/python3.10/site-packages/torch/lib/libc10.dylib\n", + " warn(f\"Failed to load image Python extension: {e}\")\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d4bf831f84cb4a88bc5fe4aa4487b241", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fetching 3 files: 0%| | 0/3 [00:00 float: + if not isinstance(x, np.ndarray): + x = x.detach().numpy() + if not isinstance(y, np.ndarray): + y = y.detach().numpy() + + # Unlike NumPy, SimSIMD can properly deal with integer types + x = x.astype(np.float32).flatten() + y = y.astype(np.float32).flatten() + return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)) + + +def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embedding): + """Test if the embeddings of text and image are semantically similar + using a small set of example text-image pairs.""" + + texts = [ + "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.", + "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.", + "A young girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.", + "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.", + "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.", + ] + + image_urls = [ + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true", + ] + + text_embeddings = [] + image_embeddings = [] + + for text, image_url in zip(texts, image_urls): + # Download and open the image + response = requests.get(image_url) + image = Image.open(BytesIO(response.content)) + + # Get 
embeddings + text_embedding = text_to_embedding(text) + image_embedding = image_to_embedding(image) + + text_embeddings.append(text_embedding) + image_embeddings.append(image_embedding) + + # Evaluate cosine similarity + for i in range(len(texts)): + pair_similarity = cosine_similarity(text_embeddings[i], image_embeddings[i]) + other_text_similarities = [ + cosine_similarity(text_embeddings[j], image_embeddings[i]) for j in range(len(texts)) if j != i + ] + other_image_similarities = [ + cosine_similarity(text_embeddings[i], image_embeddings[j]) for j in range(len(texts)) if j != i + ] + + assert pair_similarity > max( + other_text_similarities + ), "Text should be more similar to its corresponding image than to other images." + assert pair_similarity > max( + other_image_similarities + ), "Image should be more similar to its corresponding text than to other texts." + + @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") @pytest.mark.parametrize("model_name", torch_models) def test_torch_one_embedding(model_name: str): @@ -73,6 +137,12 @@ def test_torch_one_embedding(model_name: str): assert score.shape[0] == 1, "Matching score batch size is not 1" assert joint_embedding.shape[0] == 1, "Joint embedding batch size is not 1" + # Test if the model outputs actually make sense + cross_references_image_and_text_embeddings( + lambda text: model.encode_text(processor.preprocess_text(text)), + lambda image: model.encode_image(processor.preprocess_image(image)), + ) + @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") @pytest.mark.parametrize("model_name", torch_models) @@ -94,14 +164,15 @@ def test_torch_many_embeddings(model_name: str, batch_size: int): @pytest.mark.skipif(not onnx_available, reason="ONNX is not installed") -@pytest.mark.parametrize("model_specs", onnx_models_and_providers) -def test_onnx_one_embedding(model_specs: Tuple[str, str, str]): +@pytest.mark.parametrize("model_name", onnx_models) +@pytest.mark.parametrize("device", ["CPUExecutionProvider"]) +def test_onnx_one_embedding(model_name: str, device: str): - from uform.onnx_models import ExecutionProviderError + from uform.onnx_encoders import ExecutionProviderError try: - model, processor = uform.get_model_onnx(*model_specs, token=token) + model, processor = uform.get_model_onnx(model_name, token=token, device=device) text = "a small red panda in a zoo" image_path = "assets/unum.png" @@ -115,29 +186,27 @@ def test_onnx_one_embedding(model_specs: Tuple[str, str, str]): assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1" assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1" - score, joint_embedding = model.encode_multimodal( - image_features=image_features, - text_features=text_features, - attention_mask=text_data["attention_mask"], - return_scores=True, + # Test if the model outputs actually make sense + cross_references_image_and_text_embeddings( + lambda text: model.encode_text(processor.preprocess_text(text)), + lambda image: model.encode_image(processor.preprocess_image(image)), ) - assert score.shape[0] == 1, "Matching score batch size is not 1" - assert joint_embedding.shape[0] == 1, "Joint embedding batch size is not 1" except ExecutionProviderError as e: pytest.skip(f"Execution provider error: {e}") @pytest.mark.skipif(not onnx_available, reason="ONNX is not installed") -@pytest.mark.parametrize("model_specs", onnx_models_and_providers) +@pytest.mark.parametrize("model_name", onnx_models) @pytest.mark.parametrize("batch_size", 
[1, 2]) -def test_onnx_many_embeddings(model_specs: Tuple[str, str, str], batch_size: int): +@pytest.mark.parametrize("device", ["CPUExecutionProvider"]) +def test_onnx_many_embeddings(model_name: str, batch_size: int, device: str): - from uform.onnx_models import ExecutionProviderError + from uform.onnx_encoders import ExecutionProviderError try: - model, processor = uform.get_model_onnx(*model_specs, token=token) + model, processor = uform.get_model_onnx(model_name, token=token, device=device) texts = ["a small red panda in a zoo"] * batch_size image_paths = ["assets/unum.png"] * batch_size diff --git a/python/uform/__init__.py b/python/uform/__init__.py index f5a15c2..44fce13 100755 --- a/python/uform/__init__.py +++ b/python/uform/__init__.py @@ -1,6 +1,6 @@ from json import load from os.path import join, exists -from typing import Mapping, Optional, Tuple +from typing import Dict, Optional, Tuple, Literal from enum import Enum from huggingface_hub import snapshot_download @@ -9,15 +9,38 @@ class Modality(Enum): TEXT_ENCODER = "text_encoder" IMAGE_ENCODER = "image_encoder" + VIDEO_ENCODER = "video_encoder" + TEXT_DECODER = "text_decoder" -def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str, Modality]) -> Tuple[str, Mapping, str]: - import torch +def normalize_modalities(modalities: Tuple[str, Modality]) -> Tuple[Modality]: + if modalities is None: + return (Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER, Modality.TEXT_DECODER, Modality.VIDEO_ENCODER) + + return tuple(x if isinstance(x, Modality) else Modality(x) for x in modalities) + + +def get_checkpoint( + model_name: str, + modalities: Tuple[str, Modality], + token: Optional[str] = None, + format: Literal[".pt", ".onnx"] = ".pt", +) -> Tuple[str, Dict[Modality, str], Optional[str]]: + """Downloads a model checkpoint from the Hugging Face Hub. + + :param model_name: The name of the model to download, like `unum-cloud/uform3-image-text-english-small` + :param token: The Hugging Face API token, if required + :param modalities: The modalities to download, like `("text_encoder", "image_encoder")` + :param format: The format of the model checkpoint, either `.pt` or `.onnx` + :return: A tuple of the config path, dictionary of paths to different modalities, and tokenizer path + """ + + modalities = normalize_modalities(modalities) # It is not recommended to use `.pth` extension when checkpointing models # because it collides with Python path (`.pth`) configuration files. - merged_model_names = ["torch_weight.pt", "weight.pt", "model.pt"] - separate_modality_names = [(x.value if isinstance(x, Modality) else x) + ".pt" for x in modalities] + merged_model_names = [x + format for x in ["torch_weight", "weight", "model"]] + separate_modality_names = [(x.value if isinstance(x, Modality) else x) + format for x in modalities] config_names = ["torch_config.json", "config.json"] tokenizer_names = ["tokenizer.json"] @@ -45,65 +68,58 @@ def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str, # Ideally, we want to separately fetch all the models. # If those aren't available, aggregate separate modalities and merge them. 
- state = None + modality_paths = None for file_name in merged_model_names: if exists(join(model_path, file_name)): - state = torch.load(join(model_path, file_name)) + modality_paths = join(model_path, file_name) break - if state is None: - state = {} - for file_name in separate_modality_names: - if exists(join(model_path, file_name)): - modality_name, _, _ = file_name.partition(".") - property_name = modality_name + "_encoder" - state[property_name] = torch.load(join(model_path, file_name)) + if modality_paths is None: + modality_paths = {} + for separate_modality_name in separate_modality_names: + if exists(join(model_path, separate_modality_name)): + modality_name, _, _ = separate_modality_name.partition(".") + modality_paths[Modality(modality_name)] = join(model_path, separate_modality_name) - return config_path, state, tokenizer_path + return config_path, modality_paths, tokenizer_path -def get_model(model_name: str, token: Optional[str] = None, modalities: Optional[Tuple[str]] = None): - from python.uform.torch_encoders import TextVisualEncoder - from python.uform.torch_processors import TorchProcessor +def get_model( + model_name: str, + *, + token: Optional[str] = None, + modalities: Optional[Tuple[str]] = None, +): + from uform.torch_encoders import TextVisualEncoder + from uform.torch_processors import TorchProcessor - if modalities is None: - modalities = (Modality.TEXT, Modality.IMAGE) - - config_path, state, tokenizer_path = get_checkpoint(model_name, token, modalities) - - with open(config_path) as f: - config = load(f) + config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".pt") + modality_paths = ( + {k.value: v for k, v in modality_paths.items()} if isinstance(modality_paths, dict) else modality_paths + ) - model = TextVisualEncoder(config, tokenizer_path) - model.image_encoder.load_state_dict(state.get("image_encoder", None)) - model.text_encoder.load_state_dict(state.get("text_encoder", None)) - processor = TorchProcessor(config, tokenizer_path) + model = TextVisualEncoder(config_path, modality_paths) + processor = TorchProcessor(config_path, tokenizer_path) return model.eval(), processor -def get_model_onnx(model_name: str, device: str, dtype: str, token: Optional[str] = None): - from python.uform.onnx_encoders import TextVisualEncoder - from python.uform.numpy_processors import NumPyProcessor +def get_model_onnx( + model_name: str, + *, + device: Literal["cpu", "cuda"] = "cpu", + token: Optional[str] = None, + modalities: Optional[Tuple[str]] = None, +): + from uform.onnx_encoders import TextVisualEncoder + from uform.numpy_processors import NumPyProcessor - assert device in ( - "cpu", - "gpu", - ), f"Invalid `device`: {device}. Must be either `cpu` or `gpu`" - assert dtype in ( - "fp32", - "fp16", - ), f"Invalid `dtype`: {dtype}. 
Must be either `fp32` or `fp16` (only for gpu)" - assert ( - device == "cpu" and dtype == "fp32" - ) or device == "gpu", "Combination `device`=`cpu` & `dtype=fp16` is not supported" - - model_path = snapshot_download(repo_id=f"{model_name}-{device}-{dtype}", token=token) - - with open(join(model_path, "config.json")) as f: - config = load(f) + config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".onnx") + modality_paths = ( + {k.value: v for k, v in modality_paths.items()} if isinstance(modality_paths, dict) else modality_paths + ) - model = TextVisualEncoder(model_path, config, device, dtype) - processor = NumPyProcessor(config, join(model_path, "tokenizer.json")) + model = TextVisualEncoder(config_path, modality_paths, device=device) + processor = NumPyProcessor(config_path, tokenizer_path) return model, processor diff --git a/python/uform/numpy_processors.py b/python/uform/numpy_processors.py index a556db4..d300504 100644 --- a/python/uform/numpy_processors.py +++ b/python/uform/numpy_processors.py @@ -1,5 +1,6 @@ from os import PathLike from typing import Dict, List, Union +import json from PIL.Image import Image, BICUBIC from tokenizers import Tokenizer @@ -7,13 +8,14 @@ class NumPyProcessor: - def __init__(self, config: Dict, tokenizer_path: PathLike): + def __init__(self, config_path: PathLike, tokenizer_path: PathLike): """ :param config: model config :param tokenizer_path: path to tokenizer file :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy) """ + config = json.load(open(config_path, "r")) self._image_size = config["image_encoder"]["image_size"] self._max_seq_len = config["text_encoder"]["max_position_embeddings"] self._tokenizer = Tokenizer.from_file(tokenizer_path) diff --git a/python/uform/onnx_encoders.py b/python/uform/onnx_encoders.py index 68255de..8201693 100644 --- a/python/uform/onnx_encoders.py +++ b/python/uform/onnx_encoders.py @@ -1,5 +1,6 @@ -from os.path import join -from typing import Dict, Optional, Tuple, Union +from os import PathLike +from typing import Dict, Optional, Tuple, Union, Literal +import json import onnxruntime as ort from numpy import ndarray @@ -9,18 +10,52 @@ class ExecutionProviderError(Exception): """Exception raised when a requested execution provider is not available.""" -def available_providers(device: str) -> Tuple[str, ...]: +def available_providers(device: Optional[str]) -> Tuple[str, ...]: + """Returns a tuple of available execution providers based on the requested device. + https://onnxruntime.ai/docs/execution-providers/ + + :param device: Device name, either `cpu` or `gpu`, or a specific execution provider name. + :return: Tuple of available execution providers. + :raises ExecutionProviderError: If the requested device is not available. 
+ """ + gpu_providers = ("CUDAExecutionProvider", "TensorrtExecutionProvider") cpu_providers = ("OpenVINOExecutionProvider", "CoreMLExecutionProvider", "CPUExecutionProvider") available = ort.get_available_providers() - if device == "gpu": + + # If no target device is specified, let's sort all the available ones with respect to our preference + if device is None: + preferences = gpu_providers + cpu_providers + filtered_preferences = tuple(provider for provider in preferences if provider in available) + if len(filtered_preferences): + return filtered_preferences + if len(available): + return available + raise ExecutionProviderError("No execution providers are available") + + # If a GPU is requested, but no GPU providers are available, raise an error + if device == "gpu" or device == "cuda": if all(provider not in available for provider in gpu_providers): raise ExecutionProviderError( f"GPU providers are not available, consider installing `onnxruntime-gpu` and make sure the CUDA is available on your system. Currently installed: {available}" ) return gpu_providers - return cpu_providers + # If a CPU is requested, but no CPU providers are available, raise an error + if device == "cpu": + if all(provider not in available for provider in cpu_providers): + raise ExecutionProviderError( + f"CPU providers are not available, consider installing `onnxruntime` and make sure the OpenVINO and CoreML are available on your system. Currently installed: {available}" + ) + return cpu_providers + + if device not in available: + available_providers = ", ".join(available) + raise ExecutionProviderError( + f"Execution provider {device} is not available. Currently installed: {available_providers}" + ) + + return (device,) class VisualEncoder: @@ -40,11 +75,11 @@ def __init__(self, model_path: str, device: str): ) def __call__(self, images: ndarray) -> Tuple[ndarray, ndarray]: - return self.session.run(None, {"images": images}) + return self.session.run(None, {"input": images}) class TextEncoder: - def __init__(self, text_encoder_path: str, reranker_path: str, device: str): + def __init__(self, text_encoder_path: str, device: str): """ :param text_encoder_path: Path to onnx of text encoder :param reranker_path: Path to onnx of reranker @@ -60,56 +95,35 @@ def __init__(self, text_encoder_path: str, reranker_path: str, device: str): providers=available_providers(device), ) - self.reranker_session = ort.InferenceSession( - reranker_path, - sess_options=session_options, - providers=available_providers(device), - ) - def __call__(self, input_ids: ndarray, attention_mask: ndarray) -> Tuple[ndarray, ndarray]: return self.text_encoder_session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask}) - def forward_multimodal( - self, text_features: ndarray, attention_mask: ndarray, image_features: ndarray - ) -> Tuple[ndarray, ndarray]: - return self.reranker_session.run( - None, - { - "text_features": text_features, - "attention_mask": attention_mask, - "image_features": image_features, - }, - ) - class TextVisualEncoder: - def __init__(self, checkpoint_path: str, config: Dict, device: str, dtype: str): - assert device in ( - "cpu", - "gpu", - ), f"Invalid `device`: {device}. Must be either `cpu` or `gpu`" - assert dtype in ( - "fp32", - "fp16", - ), f"Invalid `dtype`: {dtype}. 
Must be either `fp32` or `fp16` (only for gpu)" - assert ( - device == "cpu" and dtype == "fp32" - ) or device == "gpu", "Combination `device`=`cpu` & `dtype=fp16` is not supported" - + def __init__( + self, + config_path: PathLike, + modality_paths: Union[Dict[str, PathLike], PathLike] = None, + *, + device: Literal["cpu", "cuda"] = "cpu", + ): + """Initializes the model with the configuration and pre-trained weights. + + :param config_path: Path to the JSON model configuration file + :param modality_paths: Dictionary with paths to different modalities, + or a single path to the model checkpoint + """ self.device = device - self.dtype = dtype + config = json.load(open(config_path, "r")) self._embedding_dim = config["text_encoder"]["embedding_dim"] self._text_encoder_dim = config["text_encoder"]["dim"] self._image_encoder_dim = config["image_encoder"]["dim"] - self.text_encoder = TextEncoder( - join(checkpoint_path, f"text_encoder.onnx"), - join(checkpoint_path, f"reranker.onnx"), - device, - ) - - self.image_encoder = VisualEncoder(join(checkpoint_path, f"image_encoder.onnx"), device) + text_encoder_path = modality_paths.get("text_encoder", None) + image_encoder_path = modality_paths.get("image_encoder", None) + self.text_encoder = TextEncoder(text_encoder_path, device) if text_encoder_path else None + self.image_encoder = VisualEncoder(image_encoder_path, device) if image_encoder_path else None def encode_image( self, @@ -147,51 +161,6 @@ def encode_text( return embeddings - def encode_multimodal( - self, - image: Optional[ndarray] = None, - text: Dict[str, ndarray] = None, - image_features: Optional[ndarray] = None, - text_features: Optional[ndarray] = None, - attention_mask: Optional[ndarray] = None, - return_scores: bool = False, - ) -> Union[ndarray, Tuple[ndarray, ndarray]]: - """Passes preprocessed texts (or precomputed texts features) and - preprocessed images (or precomputed images features) through multimodal encoded to produce matching scores and optionally multimodal joint embeddings. 
- - :param image: Preprocessed images - :param text: Preprocessed texts - :param image_features: Precomputed images features - :param text_features: Precomputed text features - :param attention_mask: Attention masks, not required if pass `text` instead of text_features - """ - - assert image is not None or image_features is not None, "Either `image` or `image_features` should be non None" - assert text is not None or text_features is not None, "Either `text_data` or `text_features` should be non None" - - if text_features is not None: - assert attention_mask is not None, "if `text_features` is not None, then you should pass `attention_mask`" - - if image_features is None: - image_features = self.image_encoder(image) - - if text_features is None: - text_features = self.text_encoder( - text["input_ids"], - text["attention_mask"], - ) - - matching_scores, embeddings = self.text_encoder.forward_multimodal( - text_features, - attention_mask if attention_mask is not None else text["attention_mask"], - image_features, - ) - - if return_scores: - return matching_scores, embeddings - - return embeddings - def forward( self, images: ndarray, diff --git a/python/uform/torch_encoders.py b/python/uform/torch_encoders.py index 4339765..2a0a0c9 100644 --- a/python/uform/torch_encoders.py +++ b/python/uform/torch_encoders.py @@ -1,6 +1,7 @@ from dataclasses import dataclass from os import PathLike from typing import Dict, Optional, Tuple, Union +import json import torch import torch.nn as nn @@ -358,17 +359,45 @@ class TextVisualEncoder(nn.Module): Vision-Language Model for Multimodal embeddings. """ - def __init__(self, config: Dict, tokenizer_path: PathLike): - """ - :param config: Model config + def __init__( + self, + config_path: PathLike, + modality_paths: Union[Dict[str, PathLike], PathLike] = None, + ): + """Initializes the model with the configuration and pre-trained weights. + + :param config_path: Path to the JSON model configuration file + :param modality_paths: Dictionary with paths to different modalities, + or a single path to the model checkpoint """ super().__init__() - config["text_encoder"].pop("tokenizer_class", None) + config = json.load(open(config_path, "r")) self._embedding_dim = config["text_encoder"]["embedding_dim"] - self.text_encoder = TextEncoder(**config["text_encoder"]) - self.image_encoder = VisualEncoder(**config["image_encoder"]) + + # Both `text_encoder` and `image_encoder` are data-classes, so we must strip + # all the non-member attributes before initializing the classes. 
+ text_fields = TextEncoder.__dataclass_fields__ + image_fields = VisualEncoder.__dataclass_fields__ + text_encoder_attrs = {k: v for k, v in config["text_encoder"].items() if k in text_fields} + image_encoder_attrs = {k: v for k, v in config["image_encoder"].items() if k in image_fields} + self.text_encoder = TextEncoder(**text_encoder_attrs) + self.image_encoder = VisualEncoder(**image_encoder_attrs) + + # Load pre-trained weights + if modality_paths is not None: + if isinstance(modality_paths, Union[PathLike, str]): + state = torch.load(modality_paths) + self.text_encoder.load_state_dict(state["text_encoder"]) + self.image_encoder.load_state_dict(state["image_encoder"]) + else: + text_encoder_path = modality_paths.get("text_encoder", None) + image_encoder_path = modality_paths.get("image_encoder", None) + if text_encoder_path: + self.text_encoder.load_state_dict(torch.load(text_encoder_path)) + if image_encoder_path: + self.image_encoder.load_state_dict(torch.load(image_encoder_path)) def encode_image( self, diff --git a/python/uform/torch_processors.py b/python/uform/torch_processors.py index 8bdc70b..b435efb 100644 --- a/python/uform/torch_processors.py +++ b/python/uform/torch_processors.py @@ -1,5 +1,6 @@ from os import PathLike from typing import Dict, List, Union +import json import torch from PIL.Image import Image @@ -15,19 +16,20 @@ ) -# lambda is not pickable +# lambda is not pickle-able def convert_to_rgb(image): return image.convert("RGB") class TorchProcessor: - def __init__(self, config: Dict, tokenizer_path: PathLike): + def __init__(self, config_path: PathLike, tokenizer_path: PathLike): """ :param config: model config :param tokenizer_path: path to tokenizer file :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy) """ + config = json.load(open(config_path, "r")) self._image_size = config["image_encoder"]["image_size"] self._max_seq_len = config["text_encoder"]["max_position_embeddings"] self._tokenizer = Tokenizer.from_file(tokenizer_path) diff --git a/swift/Embeddings.swift b/swift/Encoders.swift similarity index 98% rename from swift/Embeddings.swift rename to swift/Encoders.swift index 6d973ac..bc78433 100644 --- a/swift/Embeddings.swift +++ b/swift/Encoders.swift @@ -11,6 +11,17 @@ import Foundation import Hub // `Config` import Tokenizers // `AutoTokenizer` + +enum EncoderError: Error { + case configLoadingError(String) + case modelLoadingError(String) + case unsupportedDataType + case invalidInput + case unsupportedShapeConstraint + case modelPredictionFailed(String) +} + + public enum Embedding { case i32s([Int32]) case f16s([Float16]) diff --git a/swift/EmbeddingsTests.swift b/swift/EncodersTests.swift similarity index 97% rename from swift/EmbeddingsTests.swift rename to swift/EncodersTests.swift index 889cdb6..caab363 100644 --- a/swift/EmbeddingsTests.swift +++ b/swift/EncodersTests.swift @@ -27,7 +27,7 @@ final class TokenizerTests: XCTestCase { let api = HubApi(hfToken: "xxx") let textModel = try await TextEncoder( - modelName: "unum-cloud/uform2-vl-english-small", + modelName: "unum-cloud/uform3-image-text-english-small", hubApi: api ) @@ -78,11 +78,11 @@ final class TokenizerTests: XCTestCase { // A better option is to fetch directly from HuggingFace, similar to how users would do that: let api = HubApi(hfToken: "xxx") let textModel = try await TextEncoder( - modelName: "unum-cloud/uform2-vl-english-small", + modelName: "unum-cloud/uform3-image-text-english-small", hubApi: api ) let imageModel = try await ImageEncoder( - 
modelName: "unum-cloud/uform2-vl-english-small", + modelName: "unum-cloud/uform3-image-text-english-small", hubApi: api ) diff --git a/swift/README.md b/swift/README.md index 1eebf29..66b531f 100644 --- a/swift/README.md +++ b/swift/README.md @@ -19,7 +19,7 @@ import UForm ### Text Embeddings ```swift -let textModel = try await TextEncoder(modelName: "unum-cloud/uform2-vl-english-small") +let textModel = try await TextEncoder(modelName: "unum-cloud/uform3-image-text-english-small") let text = "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie." let textEmbedding: Embedding = try textModel.forward(with: text) let textVector: [Float32] = textEmbedding.asFloats() @@ -28,7 +28,7 @@ let textVector: [Float32] = textEmbedding.asFloats() ### Image Embeddings ```swift -let imageModel = try await ImageEncoder(modelName: "unum-cloud/uform2-vl-english-small") +let imageModel = try await ImageEncoder(modelName: "unum-cloud/uform3-image-text-english-small") let imageURL = "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true" guard let url = URL(string: imageURL), let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil),