diff --git a/.gitignore b/.gitignore
index af7d4af..4db8e17 100755
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,13 @@ test
build/
package-lock.json
*.egg-info
-*.onnx
__pycache__
.build
-.swiftpm
\ No newline at end of file
+.swiftpm
+.hf_token
+node_modules
+
+# Tensors & ML Model
+*.onnx
+*.pt
+*.safetensors
diff --git a/README.md b/README.md
index 031c484..32957e7 100755
--- a/README.md
+++ b/README.md
@@ -20,9 +20,11 @@ For Content Understanding and Generation
Multimodal Embeddings from 64 to 768 Dimensions • 1B Parameter Chat
-Short Texts • Images • 🔜 Video Clips
+Short Texts • Images • 🔜 Video Clips • 🔜 Long Documents
-PyTorch • ONNX
+ONNX • CoreML • PyTorch
+
+Python • JavaScript • Swift
---
@@ -279,7 +281,7 @@ The generative model can be used to caption images, summarize their content, or
The exact behavior is controlled by prompts.
```python
-from uform.gen_model import VLMForCausalLM, VLMProcessor
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor
model = VLMForCausalLM.from_pretrained('unum-cloud/uform-gen')
processor = VLMProcessor.from_pretrained('unum-cloud/uform-gen')
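+
+# A minimal captioning sketch; the prompt wording and generation settings are illustrative
+import torch
+from PIL import Image
+
+prompt = 'Describe the image in great detail.'
+image = Image.open('assets/unum.png')
+inputs = processor(texts=[prompt], images=[image], return_tensors='pt')
+with torch.inference_mode():
+    output = model.generate(
+        **inputs,
+        do_sample=False,
+        max_new_tokens=128,
+        eos_token_id=32001,  # EOS_TOKEN from uform.chat
+        pad_token_id=processor.tokenizer.pad_token_id)
+prompt_len = inputs['input_ids'].shape[1]
+print(processor.batch_decode(output[:, prompt_len:])[0])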
diff --git a/javascript/README.md b/javascript/README.md
new file mode 100644
index 0000000..5626d39
--- /dev/null
+++ b/javascript/README.md
@@ -0,0 +1,10 @@
+# UForm for JavaScript
+
+Pocket-Sized Multimodal AI for Content Understanding and Generation.
+
+Install UForm with the package manager of your choice:
+
+```bash
+pnpm add uform
+npm add uform
+yarn add uform
+```
+
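+UForm for JavaScript relies on `onnxruntime-web` for inference and `@huggingface/hub` for model downloads.
+The bindings are still work-in-progress, so below is only a hypothetical loading sketch; the repository id
+and file names follow the ONNX export notebook and may change:
+
+```js
+import { downloadFile } from "@huggingface/hub";
+import { InferenceSession } from "onnxruntime-web";
+
+// Fetch the fp16 text encoder exported by `python/scripts/export_encoders.ipynb`
+const response = await downloadFile({
+  repo: "unum-cloud/uform2-vl-english-small",
+  path: "text.onnx",
+});
+if (!response) throw new Error("Model file not found on the HuggingFace Hub");
+const session = await InferenceSession.create(await response.arrayBuffer());
+
+// The exported graph takes `input_ids` and `attention_mask`,
+// and produces `features` and `embeddings`
+console.log(session.inputNames, session.outputNames);
+```
+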
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..7331231
--- /dev/null
+++ b/package.json
@@ -0,0 +1,11 @@
+{
+ "name": "uform",
+ "private": true,
+ "version": "2.0.2",
+ "description": "Pocket-Sized Multimodal AI for Content Understanding and Generation",
+ "dependencies": {
+ "@huggingface/hub": "^0.14.8",
+ "@xenova/transformers": "^2.17.0",
+ "onnxruntime-web": "^1.17.3"
+ }
+}
diff --git a/python/scripts/bench.py b/python/scripts/bench.py
index 49c7004..8bcaf37 100644
--- a/python/scripts/bench.py
+++ b/python/scripts/bench.py
@@ -13,7 +13,7 @@
)
from uform import get_model
-from uform.gen_model import VLMForCausalLM, VLMProcessor
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor
dtype = torch.bfloat16
low_cpu_mem_usage = False
diff --git a/python/scripts/export.ipynb b/python/scripts/export_encoders.ipynb
similarity index 56%
rename from python/scripts/export.ipynb
rename to python/scripts/export_encoders.ipynb
index 7afa4cc..df57858 100644
--- a/python/scripts/export.ipynb
+++ b/python/scripts/export_encoders.ipynb
@@ -36,7 +36,7 @@
"import uform\n",
"from PIL import Image\n",
"\n",
- "model, processor = uform.get_model('unum-cloud/' + model_name)\n",
+ "model, processor = uform.get_model('unum-cloud/uform-vl-english-small')\n",
"text = 'a small red panda in a zoo'\n",
"image = Image.open('../../assets/unum.png')\n",
"\n",
@@ -83,66 +83,6 @@
" break # We break after the first layer"
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# PyTorch\n",
- "\n",
- "Let's ensure:\n",
- "\n",
- "- the `model.text_encoder` inputs are called `input_ids` and `attention_mask`, and outputs are `embeddings` and `features`.\n",
- "- the `model.image_encoder` input is called `input`, and outputs are `embeddings` and `features`.\n",
- "- the model itself works fine in `f16` half-precision, so that the model is lighter and easier to download."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "list(name for name, _ in model.text_encoder.named_parameters())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Verify input and output names for text_encoder\n",
- "text_encoder_input_names = [name for name, _ in model.text_encoder.named_parameters()]\n",
- "assert 'input_ids' in text_encoder_input_names, \"input_ids not found in text_encoder inputs\"\n",
- "assert 'attention_mask' in text_encoder_input_names, \"attention_mask not found in text_encoder inputs\"\n",
- "\n",
- "text_encoder_output_names = [name for name, _ in model.text_encoder.named_modules()]\n",
- "assert 'embeddings' in text_encoder_output_names, \"embeddings not found in text_encoder outputs\"\n",
- "assert 'features' in text_encoder_output_names, \"features not found in text_encoder outputs\"\n",
- "\n",
- "# Verify input and output names for image_encoder\n",
- "image_encoder_input_names = [name for name, _ in model.image_encoder.named_parameters()]\n",
- "assert 'input' in image_encoder_input_names, \"input not found in image_encoder inputs\"\n",
- "\n",
- "image_encoder_output_names = [name for name, _ in model.image_encoder.named_modules()]\n",
- "assert 'embeddings' in image_encoder_output_names, \"embeddings not found in image_encoder outputs\"\n",
- "assert 'features' in image_encoder_output_names, \"features not found in image_encoder outputs\"\n",
- "\n",
- "# Ensure the model can be converted to f16 half-precision\n",
- "try:\n",
- " model.half() # Convert to half precision\n",
- " print(\"Model successfully converted to half precision (f16).\")\n",
- "except Exception as e:\n",
- " print(f\"An error occurred while converting the model to half precision: {e}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## ONNX"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -241,12 +181,12 @@
"coreml_model = ct.convert(\n",
" traced_script_module, source=\"pytorch\",\n",
" inputs=[image_input], outputs=[image_features, image_embeddings],\n",
- " convert_to='mlprogram', compute_precision=precision)\n",
+ " convert_to='mlprogram', compute_precision=ct.precision)\n",
"\n",
"coreml_model.author = 'Unum Cloud'\n",
"coreml_model.license = 'Apache 2.0'\n",
"coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
- "coreml_model.save(os.path.join(output_directory, model_name + \"-image.mlpackage\"))"
+ "coreml_model.save(\"../uform-vl-english-small-image.mlpackage\")"
]
},
{
@@ -277,7 +217,256 @@
"coreml_model.author = 'Unum Cloud'\n",
"coreml_model.license = 'Apache 2.0'\n",
"coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
- "coreml_model.save(os.path.join(output_directory, model_name + \"-text.mlpackage\"))"
+ "coreml_model.save(\"../uform-vl-english-small-text.mlpackage\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# PyTorch\n",
+ "\n",
+ "Let's ensure:\n",
+ "\n",
+ "- the `model.text_encoder` inputs are called `input_ids` and `attention_mask`, and outputs are `embeddings` and `features`.\n",
+ "- the `model.image_encoder` input is called `input`, and outputs are `embeddings` and `features`.\n",
+ "- the model itself works fine in `f16` half-precision, so that the model is lighter and easier to download."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from safetensors import safe_open\n",
+ "from safetensors.torch import save_file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.image_encoder.eval()\n",
+ "model.image_encoder.to(dtype=torch.bfloat16)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "torch.save(model.image_encoder.state_dict(), 'image.pt')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "save_file(model.image_encoder.state_dict(), \"image.safetensors\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.text_encoder.eval()\n",
+ "model.text_encoder.to(dtype=torch.bfloat16)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "torch.save(model.text_encoder.state_dict(), 'text.pt')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "save_file(model.text_encoder.state_dict(), \"text.safetensors\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image_features, image_embedding = model.encode_image(image_data.to(dtype=torch.bfloat16), return_features=True)\n",
+ "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n",
+ "\n",
+ "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.safetensors image.safetensors\n",
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.safetensors text.safetensors"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.pt image.pt\n",
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.pt text.pt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## ONNX"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install onnx onnxconverter-common"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from torch.onnx import export as onnx_export"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can't immediately export to `bfloat16` as it's not supported by ONNX, but we also can't export to `float16`, as the forward pass (that will be traced) is gonna fail. So let's export to `float32` ONNX file first."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module = model.text_encoder\n",
+ "module.eval()\n",
+ "module.return_features = True\n",
+ "module.to(dtype=torch.float32)\n",
+ "\n",
+ "onnx_export(\n",
+ " module,\n",
+ " (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n",
+ " \"text.onnx\", \n",
+ " export_params=True,\n",
+ " opset_version=15,\n",
+ " do_constant_folding=True,\n",
+ " input_names = ['input_ids', 'attention_mask'], \n",
+ " output_names = ['features', 'embeddings'],\n",
+ " dynamic_axes={\n",
+ " 'input_ids' : {0 : 'batch_size'}, \n",
+ " 'attention_mask' : {0 : 'batch_size'}, \n",
+ " 'features' : {0 : 'batch_size'}, \n",
+ " 'embeddings' : {0 : 'batch_size'}})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import onnx\n",
+ "from onnxconverter_common import float16\n",
+ "\n",
+ "module = onnx.load(\"text.onnx\")\n",
+ "module_fp16 = float16.convert_float_to_float16(module)\n",
+ "onnx.save(module_fp16, \"text.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now repeat the same for images."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module = model.image_encoder\n",
+ "module.eval()\n",
+ "module.return_features = True\n",
+ "module.to(dtype=torch.float32)\n",
+ "\n",
+ "torch.onnx.export(\n",
+ " module,\n",
+ " image_data, \n",
+ " \"image.onnx\", \n",
+ " export_params=True,\n",
+ " opset_version=15,\n",
+ " do_constant_folding=True,\n",
+ " input_names = ['input'], \n",
+ " output_names = ['features', 'embeddings'],\n",
+ " dynamic_axes={\n",
+ " 'input' : {0 : 'batch_size'},\n",
+ " 'features' : {0 : 'batch_size'},\n",
+ " 'embeddings' : {0 : 'batch_size'}})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import onnx\n",
+ "from onnxconverter_common import float16\n",
+ "\n",
+ "module = onnx.load(\"image.onnx\")\n",
+ "module_fp16 = float16.convert_float_to_float16(module)\n",
+ "onnx.save(module_fp16, \"image.onnx\")"
+ ]
+ },
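+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before uploading, we can sanity-check the exported `f16` ONNX files with `onnxruntime`, assuming `image_data` from the earlier cells is still around. The input and output names match the ones we passed to the exporter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import onnxruntime as ort\n",
+    "\n",
+    "session = ort.InferenceSession(\"image.onnx\", providers=[\"CPUExecutionProvider\"])\n",
+    "features, embeddings = session.run(None, {\"input\": image_data.numpy().astype(np.float16)})\n",
+    "features.shape, embeddings.shape"
+   ]
+  },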
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.onnx image.onnx\n",
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.onnx text.onnx"
]
}
],
@@ -297,7 +486,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.11"
+ "version": "3.11.5"
}
},
"nbformat": 4,
diff --git a/python/scripts/test_generative.py b/python/scripts/test_decoders.py
similarity index 100%
rename from python/scripts/test_generative.py
rename to python/scripts/test_decoders.py
diff --git a/python/scripts/test_embeddings.py b/python/scripts/test_encoders.py
similarity index 87%
rename from python/scripts/test_embeddings.py
rename to python/scripts/test_encoders.py
index d71bf0b..e7541c1 100644
--- a/python/scripts/test_embeddings.py
+++ b/python/scripts/test_encoders.py
@@ -1,4 +1,5 @@
from typing import Tuple
+import os
import pytest
from PIL import Image
@@ -21,6 +22,7 @@
onnx_available = False
torch_models = [
+ "unum-cloud/uform2-vl-english-small",
"unum-cloud/uform-vl-english",
"unum-cloud/uform-vl-multilingual-v2",
]
@@ -34,11 +36,20 @@
("unum-cloud/uform-vl-english-large", "gpu", "fp16"),
]
+# Let's check if a HuggingFace Hub API token is set in the environment.
+# If not, fall back to a `.hf_token` file in the current working directory.
+token = os.getenv("HUGGINGFACE_HUB_TOKEN", None)
+if token is None:
+ token_path = "./.hf_token"
+ if os.path.exists(token_path):
+ with open(token_path, "r") as file:
+ token = file.read().strip()
+
@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
@pytest.mark.parametrize("model_name", torch_models)
def test_torch_one_embedding(model_name: str):
- model, processor = uform.get_model(model_name)
+ model, processor = uform.get_model(model_name, token=token)
text = "a small red panda in a zoo"
image_path = "assets/unum.png"
@@ -67,7 +78,7 @@ def test_torch_one_embedding(model_name: str):
@pytest.mark.parametrize("model_name", torch_models)
@pytest.mark.parametrize("batch_size", [1, 2])
def test_torch_many_embeddings(model_name: str, batch_size: int):
- model, processor = uform.get_model(model_name)
+ model, processor = uform.get_model(model_name, token=token)
texts = ["a small red panda in a zoo"] * batch_size
image_paths = ["assets/unum.png"] * batch_size
@@ -90,7 +101,7 @@ def test_onnx_one_embedding(model_specs: Tuple[str, str, str]):
try:
- model, processor = uform.get_model_onnx(*model_specs)
+ model, processor = uform.get_model_onnx(*model_specs, token=token)
text = "a small red panda in a zoo"
image_path = "assets/unum.png"
@@ -126,7 +137,7 @@ def test_onnx_many_embeddings(model_specs: Tuple[str, str, str], batch_size: int
try:
- model, processor = uform.get_model_onnx(*model_specs)
+ model, processor = uform.get_model_onnx(*model_specs, token=token)
texts = ["a small red panda in a zoo"] * batch_size
image_paths = ["assets/unum.png"] * batch_size
diff --git a/python/uform/__init__.py b/python/uform/__init__.py
index 1ecb242..f5a15c2 100755
--- a/python/uform/__init__.py
+++ b/python/uform/__init__.py
@@ -1,40 +1,90 @@
from json import load
-from os.path import join
+from os.path import join, exists
-from typing import Mapping, Optional, Tuple
+from typing import Mapping, Optional, Tuple, Union
+from enum import Enum
from huggingface_hub import snapshot_download
-def get_checkpoint(model_name: str, token: str) -> Tuple[str, Mapping, str]:
- import torch
-
- model_path = snapshot_download(repo_id=model_name, token=token)
- config_path = join(model_path, "torch_config.json")
+class Modality(Enum):
+ TEXT_ENCODER = "text_encoder"
+ IMAGE_ENCODER = "image_encoder"
- state = torch.load(join(model_path, "torch_weight.pt"))
- return config_path, state, join(model_path, "tokenizer.json")
+def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[Union[str, Modality], ...]) -> Tuple[str, Mapping, str]:
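+    """Resolve the config, tokenizer, and checkpoint paths for a model hosted on the HuggingFace Hub,
+    downloading only the files needed for the requested modalities.
+
+    :param model_name: HuggingFace Hub repository id
+    :param token: optional HuggingFace Hub token, for private repositories
+    :param modalities: encoders to fetch when only per-modality checkpoints are published
+    """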
+ import torch
-def get_model(model_name: str, token: Optional[str] = None):
- from uform.torch_models import VLM
- from uform.torch_preprocessor import TorchProcessor
-
- config_path, state, tokenizer_path = get_checkpoint(model_name, token)
+ # It is not recommended to use `.pth` extension when checkpointing models
+ # because it collides with Python path (`.pth`) configuration files.
+ merged_model_names = ["torch_weight.pt", "weight.pt", "model.pt"]
+    # The per-modality checkpoints are exported as `text.pt` and `image.pt`
+    modality_stems = [(x.value if isinstance(x, Modality) else x).removesuffix("_encoder") for x in modalities]
+    separate_modality_names = [stem + ".pt" for stem in modality_stems]
+ config_names = ["torch_config.json", "config.json"]
+ tokenizer_names = ["tokenizer.json"]
+
+ # The download stats depend on the number of times the `config.json` is pulled
+ # https://huggingface.co/docs/hub/models-download-stats
+ model_path = snapshot_download(
+ repo_id=model_name,
+ token=token,
+ allow_patterns=merged_model_names + separate_modality_names + config_names + tokenizer_names,
+ )
+
+ # Find the first name in `config_names` that is present
+ config_path = None
+ for config_name in config_names:
+ if exists(join(model_path, config_name)):
+ config_path = join(model_path, config_name)
+ break
+
+ # Same for the tokenizer
+ tokenizer_path = None
+ for tokenizer_name in tokenizer_names:
+ if exists(join(model_path, tokenizer_name)):
+ tokenizer_path = join(model_path, tokenizer_name)
+ break
+
+    # Ideally, we would fetch the single merged checkpoint.
+    # If it isn't available, load the per-modality checkpoints and merge them.
+ state = None
+ for file_name in merged_model_names:
+ if exists(join(model_path, file_name)):
+ state = torch.load(join(model_path, file_name))
+ break
+
+ if state is None:
+ state = {}
+ for file_name in separate_modality_names:
+ if exists(join(model_path, file_name)):
+ modality_name, _, _ = file_name.partition(".")
+ property_name = modality_name + "_encoder"
+ state[property_name] = torch.load(join(model_path, file_name))
+
+ return config_path, state, tokenizer_path
+
+
+def get_model(model_name: str, token: Optional[str] = None, modalities: Optional[Tuple[Union[str, Modality], ...]] = None):
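+    """Load a pre-trained UForm encoder model and its pre-processor.
+
+    :param model_name: HuggingFace Hub repository id, like `unum-cloud/uform-vl-english`
+    :param token: optional HuggingFace Hub token, for private repositories
+    :param modalities: encoders to load; defaults to both text and image
+    """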
+    from uform.torch_encoders import TextVisualEncoder
+    from uform.torch_processors import TorchProcessor
+
+ if modalities is None:
+        modalities = (Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER)
+
+ config_path, state, tokenizer_path = get_checkpoint(model_name, token, modalities)
with open(config_path) as f:
config = load(f)
- model = VLM(config, tokenizer_path)
- model.image_encoder.load_state_dict(state["image_encoder"])
- model.text_encoder.load_state_dict(state["text_encoder"])
+ model = TextVisualEncoder(config, tokenizer_path)
+    if "image_encoder" in state:
+        model.image_encoder.load_state_dict(state["image_encoder"])
+    if "text_encoder" in state:
+        model.text_encoder.load_state_dict(state["text_encoder"])
processor = TorchProcessor(config, tokenizer_path)
return model.eval(), processor
def get_model_onnx(model_name: str, device: str, dtype: str, token: Optional[str] = None):
- from uform.onnx_models import VLM_ONNX
- from uform.numpy_preprocessor import NumPyProcessor
+    from uform.onnx_encoders import TextVisualEncoder
+    from uform.numpy_processors import NumPyProcessor
assert device in (
"cpu",
@@ -53,7 +103,7 @@ def get_model_onnx(model_name: str, device: str, dtype: str, token: Optional[str
with open(join(model_path, "config.json")) as f:
config = load(f)
- model = VLM_ONNX(model_path, config, device, dtype)
+ model = TextVisualEncoder(model_path, config, device, dtype)
processor = NumPyProcessor(config, join(model_path, "tokenizer.json"))
return model, processor
diff --git a/python/uform/chat.py b/python/uform/chat.py
index 5ef44b7..c9f8dc3 100644
--- a/python/uform/chat.py
+++ b/python/uform/chat.py
@@ -5,7 +5,7 @@
from PIL import Image
from transformers import TextStreamer
-from uform.gen_model import VLMForCausalLM, VLMProcessor
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor
EOS_TOKEN = 32001
diff --git a/python/uform/gen_model.py b/python/uform/gen_model.py
index c03b6eb..6792120 100644
--- a/python/uform/gen_model.py
+++ b/python/uform/gen_model.py
@@ -1,464 +1 @@
-from typing import List, Optional, Tuple, Union
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-from torchvision.transforms import (CenterCrop, Compose, InterpolationMode,
- Normalize, RandomResizedCrop, Resize,
- ToTensor)
-from transformers import AutoConfig, AutoTokenizer
-from transformers.configuration_utils import PretrainedConfig
-from transformers.modeling_outputs import CausalLMOutputWithPast
-from transformers.modeling_utils import PreTrainedModel
-from transformers.models.auto.modeling_auto import (AutoModel,
- AutoModelForCausalLM)
-from transformers.processing_utils import ProcessorMixin
-from transformers.tokenization_utils_base import BatchEncoding
-
-from uform.torch_models import VisualEncoder
-
-IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073)
-IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711)
-
-
-def convert_to_rgb(image):
- return image.convert("RGB")
-
-
-class LayerScale(nn.Module):
- def __init__(self, dim, init_values: float = 1e-5, inplace: bool = False):
- super().__init__()
- self.weight = nn.Parameter(init_values * torch.ones(dim))
- self.inplace = inplace
-
- def forward(self, x):
- return x.mul_(self.weight) if self.inplace else x * self.weight
-
-
-class ImageFeaturesPooler(nn.Module):
- def __init__(
- self,
- input_size,
- hidden_size,
- num_attn_heads,
- intermediate_size,
- num_latents,
- initializer_range,
- ):
- super().__init__()
- self.projection = nn.Linear(input_size, hidden_size)
-
- self.pooler = nn.TransformerDecoderLayer(
- hidden_size,
- num_attn_heads,
- intermediate_size,
- activation=nn.functional.silu,
- batch_first=True,
- norm_first=True,
- )
- self.image_latents = nn.Parameter(
- torch.randn(1, num_latents, hidden_size) * initializer_range**0.5,
- )
-
- def forward(self, features):
- features = self.projection(features)
- return self.pooler(
- self.image_latents.expand(features.shape[0], -1, -1),
- features,
- )
-
-
-class VLMConfig(PretrainedConfig):
- model_type = "vlm"
-
- def __init__(
- self,
- text_decoder_name_or_path: str = "",
- tokenizer_name_or_path: str = "",
- image_size: int = 224,
- image_encoder_hidden_size: int = 768,
- image_encoder_patch_size: int = 16,
- image_encoder_num_layers: int = 12,
- image_encoder_num_heads: int = 12,
- image_encoder_embedding_dim: int = 256,
- image_encoder_pooling: str = "cls",
- image_pooler_num_attn_heads: int = 16,
- image_pooler_intermediate_size: int = 5504,
- image_pooler_num_latents: int = 196,
- image_token_id: int = 32002,
- initializer_range: float = 0.02,
- use_cache: bool = True,
- center_crop: bool = True,
- **kwargs,
- ):
- self.text_decoder_name_or_path = text_decoder_name_or_path
- self.tokenizer_name_or_path = tokenizer_name_or_path
-
- self.image_size = image_size
- self.image_encoder_hidden_size = image_encoder_hidden_size
- self.image_encoder_patch_size = image_encoder_patch_size
- self.image_encoder_num_layers = image_encoder_num_layers
- self.image_encoder_num_heads = image_encoder_num_heads
- self.image_encoder_embedding_dim = image_encoder_embedding_dim
- self.image_encoder_pooling = image_encoder_pooling
-
- self.image_pooler_num_attn_heads = image_pooler_num_attn_heads
- self.image_pooler_intermediate_size = image_pooler_intermediate_size
- self.image_pooler_num_latents = image_pooler_num_latents
-
- self.image_token_id = image_token_id
-
- self.initializer_range = initializer_range
- self.use_cache = use_cache
- self.center_crop = center_crop
-
- super().__init__(**kwargs)
-
-
-class VLMPreTrainedModel(PreTrainedModel):
- config_class = VLMConfig
- base_model_prefix = "vlm"
- supports_gradient_checkpointing = True
- _no_split_modules = []
- _skip_keys_device_placement = "past_key_values"
-
- def _init_weights(self, module):
- pass
-
- def _initialize_weights(self, module):
- pass
-
-
-class VLMForCausalLM(VLMPreTrainedModel):
- def __init__(self, config: VLMConfig):
- super().__init__(config)
-
- self.config = config
- self.text_config = AutoConfig.from_pretrained(config.text_decoder_name_or_path)
- self.text_config.vocab_size += 3
- self.text_decoder = AutoModelForCausalLM.from_config(self.text_config)
-
- self.image_encoder = VisualEncoder(
- self.config.image_encoder_hidden_size,
- self.config.image_encoder_patch_size,
- self.config.image_size,
- self.config.image_encoder_num_layers,
- self.config.image_encoder_num_heads,
- self.config.image_encoder_embedding_dim,
- self.config.image_encoder_pooling,
- )
-
- # replace models' layerscales because `transformers` automatically renames keys in state_dict
- for i in range(len(self.image_encoder.blocks)):
- self.image_encoder.blocks[i].ls1 = LayerScale(
- self.image_encoder.blocks[i].ls1.dim,
- )
- self.image_encoder.blocks[i].ls2 = LayerScale(
- self.image_encoder.blocks[i].ls2.dim,
- )
-
- self.image_pooler = ImageFeaturesPooler(
- self.config.image_encoder_hidden_size,
- self.text_config.hidden_size,
- self.config.image_pooler_num_attn_heads,
- self.config.image_pooler_intermediate_size,
- self.config.image_pooler_num_latents,
- self.config.initializer_range,
- )
-
- def get_input_embeddings(self):
- return self.text_decoder.get_input_embeddings()
-
- def set_input_embeddings(self, value):
- self.text_decoder.set_input_embeddings(value)
-
- def get_images_embeddings(self, images):
- features = self.image_encoder.forward_features(images)
- return self.image_pooler(features)
-
- def gather_continuous_embeddings(
- self,
- input_ids: torch.Tensor,
- word_embeddings: torch.Tensor,
- image_embeddings: torch.Tensor,
- ) -> torch.Tensor:
- start_indices = (input_ids == self.config.image_token_id).nonzero()[:, 1]
- embeddings = []
-
- for sample_idx, start_idx in enumerate(start_indices.tolist()):
- embeddings.append(
- torch.cat(
- (
- word_embeddings[sample_idx, :start_idx],
- image_embeddings[sample_idx],
- word_embeddings[sample_idx, start_idx + 1 :],
- ),
- dim=0,
- ),
- )
-
- return torch.stack(embeddings, dim=0)
-
- def forward(
- self,
- input_ids: torch.LongTensor = None,
- images: torch.Tensor = None,
- attention_mask: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.LongTensor] = None,
- past_key_values: Optional[List[torch.FloatTensor]] = None,
- inputs_embeds: Optional[torch.FloatTensor] = None,
- use_cache: Optional[bool] = None,
- labels: Optional[torch.Tensor] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- ) -> Union[dict, Tuple, CausalLMOutputWithPast]:
- output_attentions = (
- output_attentions
- if output_attentions is not None
- else self.config.output_attentions
- )
- output_hidden_states = (
- output_hidden_states
- if output_hidden_states is not None
- else self.config.output_hidden_states
- )
- use_cache = use_cache if use_cache is not None else self.config.use_cache
-
- return_dict = (
- return_dict if return_dict is not None else self.config.use_return_dict
- )
-
- if input_ids is not None and inputs_embeds is not None:
- raise ValueError(
- "You cannot specify both input_ids and inputs_embeds at the same time",
- )
- elif input_ids is None and inputs_embeds is None:
- raise ValueError("You have to specify either input_is or inputs_embeds")
-
- if inputs_embeds is None and past_key_values is None:
- inputs_embeds = self.get_input_embeddings()(input_ids)
-
- if images is not None:
- image_embeds = self.get_images_embeddings(images)
- inputs_embeds = self.gather_continuous_embeddings(
- input_ids,
- inputs_embeds,
- image_embeds,
- )
-
- if position_ids is None:
- seq_length = (
- inputs_embeds.shape[1]
- if inputs_embeds is not None
- else input_ids.shape[1]
- )
- past_key_values_length = 0
-
- if past_key_values is not None:
- past_key_values_length = past_key_values[0][0].shape[2]
-
- device = input_ids.device if input_ids is not None else inputs_embeds.device
- position_ids = torch.arange(
- past_key_values_length,
- seq_length + past_key_values_length,
- dtype=torch.long,
- device=device,
- )
- position_ids = position_ids.unsqueeze(0)
-
- outputs = self.text_decoder(
- inputs_embeds=inputs_embeds,
- input_ids=input_ids if past_key_values is not None else None,
- attention_mask=attention_mask,
- labels=labels,
- position_ids=position_ids,
- past_key_values=past_key_values,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- use_cache=use_cache,
- return_dict=return_dict,
- )
-
- return outputs
-
- def prepare_inputs_for_generation(
- self,
- input_ids,
- images=None,
- past_key_values=None,
- attention_mask=None,
- inputs_embeds=None,
- **kwargs,
- ):
- if past_key_values:
- input_ids = input_ids[:, -1:]
-
- position_ids = kwargs.get("position_ids", None)
- if attention_mask is not None and position_ids is None:
- # create position_ids on the fly for batch generation
- position_ids = attention_mask.long().cumsum(-1) - 1
- position_ids.masked_fill_(attention_mask == 0, 1)
- if past_key_values:
- position_ids = position_ids[:, -1].unsqueeze(-1)
-
- # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
- if inputs_embeds is not None and past_key_values is None:
- model_inputs = {"inputs_embeds": inputs_embeds}
- else:
- model_inputs = {"input_ids": input_ids}
-
- if images is not None:
- model_inputs["images"] = images
-
- model_inputs.update(
- {
- "position_ids": position_ids,
- "past_key_values": past_key_values,
- "use_cache": kwargs.get("use_cache"),
- "attention_mask": attention_mask,
- "images": images if past_key_values is None else None,
- },
- )
- return model_inputs
-
- @classmethod
- def from_config(cls, config, **kwargs):
- return cls._from_config(config, **kwargs)
-
-
-class VLMProcessor(ProcessorMixin):
- def __init__(self, config, **kwargs):
- self.feature_extractor = None
- self.config = config
-
- if config.center_crop:
- self.image_processor = Compose(
- [
- Resize(256, interpolation=InterpolationMode.BICUBIC),
- CenterCrop(config.image_size),
- convert_to_rgb,
- ToTensor(),
- Normalize(
- mean=IMAGENET_MEAN,
- std=IMAGENET_STD,
- ),
- ],
- )
- else:
- self.image_processor = Compose(
- [
- RandomResizedCrop(
- config.image_size,
- scale=(0.8, 1),
- interpolation=InterpolationMode.BICUBIC,
- ),
- convert_to_rgb,
- ToTensor(),
- Normalize(
- mean=IMAGENET_MEAN,
- std=IMAGENET_STD,
- ),
- ],
- )
-
- self.tokenizer = AutoTokenizer.from_pretrained(
- config.tokenizer_name_or_path,
- additional_special_tokens=["<|im_end|>"],
- )
- self.num_image_latents = config.image_pooler_num_latents
-
- def __call__(self, texts=None, images=None, return_tensors="pt", **kwargs):
- if texts is not None:
- if isinstance(texts, str):
- texts = [texts]
-
- tokenized_texts = []
- for text in texts:
- messages = [
- {"role": "system", "content": "You are a helpful assistant."},
- {"role": "user", "content": f" {text}"},
- ]
- tokenized_prompt = self.tokenizer.apply_chat_template(
- messages,
- add_generation_prompt=True,
- return_tensors=return_tensors,
- )
-
- tokenized_texts.append(tokenized_prompt)
-
- max_len = max(len(t[0]) for t in tokenized_texts)
- input_ids = torch.full(
- (len(tokenized_texts), max_len),
- fill_value=self.tokenizer.pad_token_id,
- dtype=torch.int64,
- )
- attention_mask = torch.full(
- (len(tokenized_texts), max_len),
- fill_value=0,
- dtype=torch.int64,
- )
-
- for i, tokens in enumerate(tokenized_texts):
- input_ids[i, -len(tokens[0]) :] = tokens[0]
- attention_mask[i, -len(tokens[0]) :] = 1
-
- attention_mask = F.pad(
- attention_mask,
- pad=(0, self.num_image_latents - 1),
- value=1,
- )
-
- encoding = BatchEncoding(
- data={"input_ids": input_ids, "attention_mask": attention_mask},
- )
-
- if images is not None:
- if isinstance(images, (list, tuple)):
- image_features = torch.empty(
- (len(images), 3, self.config.image_size, self.config.image_size),
- dtype=torch.float32,
- )
-
- for i, image in enumerate(images):
- image_features[i] = self.image_processor(image)
- else:
- image_features = self.image_processor(images).unsqueeze(0)
-
- if texts is not None and images is not None:
- encoding["images"] = image_features
- return encoding
-
- if texts is not None:
- return encoding
-
- return BatchEncoding(
- data={
- "images": image_features,
- },
- tensor_type=return_tensors,
- )
-
- def batch_decode(self, *args, **kwargs):
- return self.tokenizer.batch_decode(*args, **kwargs)
-
- def decode(self, *args, **kwargs):
- return self.tokenizer.decode(*args, **kwargs)
-
- @classmethod
- def from_pretrained(
- cls,
- pretrained_model_name_or_path,
- cache_dir=None,
- force_download: bool = False,
- local_files_only: bool = False,
- token=None,
- revision: str = "main",
- **kwargs,
- ):
- config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
- return cls(config)
-
-
-AutoConfig.register("vlm", VLMConfig)
-AutoModel.register(VLMConfig, VLMForCausalLM)
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor # legacy path
diff --git a/python/uform/numpy_preprocessor.py b/python/uform/numpy_processors.py
similarity index 100%
rename from python/uform/numpy_preprocessor.py
rename to python/uform/numpy_processors.py
diff --git a/python/uform/onnx_models.py b/python/uform/onnx_encoders.py
similarity index 97%
rename from python/uform/onnx_models.py
rename to python/uform/onnx_encoders.py
index 8e2a87a..68255de 100644
--- a/python/uform/onnx_models.py
+++ b/python/uform/onnx_encoders.py
@@ -23,7 +23,7 @@ def available_providers(device: str) -> Tuple[str, ...]:
return cpu_providers
-class VisualEncoderONNX:
+class VisualEncoder:
def __init__(self, model_path: str, device: str):
"""
:param model_path: Path to onnx model
@@ -43,7 +43,7 @@ def __call__(self, images: ndarray) -> Tuple[ndarray, ndarray]:
return self.session.run(None, {"images": images})
-class TextEncoderONNX:
+class TextEncoder:
def __init__(self, text_encoder_path: str, reranker_path: str, device: str):
"""
:param text_encoder_path: Path to onnx of text encoder
@@ -82,7 +82,7 @@ def forward_multimodal(
)
-class VLM_ONNX:
+class TextVisualEncoder:
def __init__(self, checkpoint_path: str, config: Dict, device: str, dtype: str):
assert device in (
"cpu",
@@ -103,13 +103,13 @@ def __init__(self, checkpoint_path: str, config: Dict, device: str, dtype: str):
self._text_encoder_dim = config["text_encoder"]["dim"]
self._image_encoder_dim = config["image_encoder"]["dim"]
- self.text_encoder = TextEncoderONNX(
+ self.text_encoder = TextEncoder(
join(checkpoint_path, f"text_encoder.onnx"),
join(checkpoint_path, f"reranker.onnx"),
device,
)
- self.image_encoder = VisualEncoderONNX(join(checkpoint_path, f"image_encoder.onnx"), device)
+ self.image_encoder = VisualEncoder(join(checkpoint_path, f"image_encoder.onnx"), device)
def encode_image(
self,
@@ -229,3 +229,6 @@ def embedding_dim(self) -> int:
def multimodal_embedding_dim(self) -> int:
"""Dimensionality of multimodal joint embedding."""
return self._text_encoder_dim
+
+
+VLM_ONNX = TextVisualEncoder # legacy
diff --git a/python/uform/preprocessing.py b/python/uform/preprocessing.py
deleted file mode 100644
index d3d833e..0000000
--- a/python/uform/preprocessing.py
+++ /dev/null
@@ -1,105 +0,0 @@
-from os import PathLike
-from typing import Dict, List, Union
-
-import torch
-from PIL import Image
-from tokenizers import Tokenizer
-from torch import Tensor
-from torchvision.transforms import (CenterCrop, Compose, InterpolationMode,
- Normalize, Resize, ToTensor)
-
-
-# lambda is not pickable
-def convert_to_rgb(image):
- return image.convert("RGB")
-
-
-class Processor:
- def __init__(self, config: Dict, tokenizer_path: PathLike, tensor_type: str = "pt"):
- """
- :param config: model config
- :param tokenizer_path: path to tokenizer file
- :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy)
- """
-
- assert tensor_type in ("pt", "np"), "`tensor_type` must be either `pt` or `np`"
-
- self._image_size = config["image_encoder"]["image_size"]
- self._max_seq_len = config["text_encoder"]["max_position_embeddings"]
- self._tokenizer = Tokenizer.from_file(tokenizer_path)
- self._tokenizer.no_padding()
- self._pad_token_idx = config["text_encoder"]["padding_idx"]
-
- self.tensor_type = tensor_type
-
- self._image_transform = Compose(
- [
- Resize(self._image_size, interpolation=InterpolationMode.BICUBIC),
- convert_to_rgb,
- CenterCrop(self._image_size),
- ToTensor(),
- Normalize(
- mean=(0.48145466, 0.4578275, 0.40821073),
- std=(0.26862954, 0.26130258, 0.27577711),
- ),
- ],
- )
-
- def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]:
- """Transforms one or more strings into dictionary with tokenized strings and attention masks.
-
- :param texts: text of list of texts to tokenizer
- """
- if isinstance(texts, str):
- texts = [texts]
-
- input_ids = torch.full(
- (len(texts), self._max_seq_len),
- fill_value=self._pad_token_idx,
- dtype=torch.int64,
- )
-
- attention_mask = torch.zeros(
- len(texts),
- self._max_seq_len,
- dtype=torch.int32,
- )
- encoded = self._tokenizer.encode_batch(texts)
-
- for i, seq in enumerate(encoded):
- seq_len = min(len(seq), self._max_seq_len)
- input_ids[i, :seq_len] = torch.LongTensor(
- seq.ids[: self._max_seq_len],
- )
- attention_mask[i, :seq_len] = 1
-
- if self.tensor_type == "np":
- return {
- "input_ids": input_ids.numpy(),
- "attention_mask": attention_mask.numpy(),
- }
-
- return {"input_ids": input_ids, "attention_mask": attention_mask}
-
- def preprocess_image(self, images: Union[Image, List[Image]]) -> Tensor:
- """Transforms one or more Pillow images into Torch Tensors.
-
- :param images: image or list of images to preprocess
- """
-
- if isinstance(images, list):
- batch_images = torch.empty(
- (len(images), 3, self._image_size, self._image_size),
- dtype=torch.float32,
- )
-
- for i, image in enumerate(images):
- batch_images[i] = self._image_transform(image)
-
- else:
- batch_images = self._image_transform(images).unsqueeze(0)
-
- if self.tensor_type == "np":
- return batch_images.numpy()
-
- return batch_images
diff --git a/python/uform/torch_decoders.py b/python/uform/torch_decoders.py
new file mode 100644
index 0000000..79b058d
--- /dev/null
+++ b/python/uform/torch_decoders.py
@@ -0,0 +1,457 @@
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torchvision.transforms import (
+ CenterCrop,
+ Compose,
+ InterpolationMode,
+ Normalize,
+ RandomResizedCrop,
+ Resize,
+ ToTensor,
+)
+from transformers import AutoConfig, AutoTokenizer
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.auto.modeling_auto import AutoModel, AutoModelForCausalLM
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import BatchEncoding
+
+from uform.torch_encoders import VisualEncoder
+
+IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+
+def convert_to_rgb(image):
+ return image.convert("RGB")
+
+
+class LayerScale(nn.Module):
+ def __init__(self, dim, init_values: float = 1e-5, inplace: bool = False):
+ super().__init__()
+ self.weight = nn.Parameter(init_values * torch.ones(dim))
+ self.inplace = inplace
+
+ def forward(self, x):
+ return x.mul_(self.weight) if self.inplace else x * self.weight
+
+
+class ImageFeaturesPooler(nn.Module):
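+    """Pools a variable number of image patch features into a fixed number of latent
+    tokens, using a single cross-attention decoder layer with learned latent queries."""
+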
+ def __init__(
+ self,
+ input_size,
+ hidden_size,
+ num_attn_heads,
+ intermediate_size,
+ num_latents,
+ initializer_range,
+ ):
+ super().__init__()
+ self.projection = nn.Linear(input_size, hidden_size)
+
+ self.pooler = nn.TransformerDecoderLayer(
+ hidden_size,
+ num_attn_heads,
+ intermediate_size,
+ activation=nn.functional.silu,
+ batch_first=True,
+ norm_first=True,
+ )
+ self.image_latents = nn.Parameter(
+ torch.randn(1, num_latents, hidden_size) * initializer_range**0.5,
+ )
+
+ def forward(self, features):
+ features = self.projection(features)
+ return self.pooler(
+ self.image_latents.expand(features.shape[0], -1, -1),
+ features,
+ )
+
+
+class VLMConfig(PretrainedConfig):
+ model_type = "vlm"
+
+ def __init__(
+ self,
+ text_decoder_name_or_path: str = "",
+ tokenizer_name_or_path: str = "",
+ image_size: int = 224,
+ image_encoder_hidden_size: int = 768,
+ image_encoder_patch_size: int = 16,
+ image_encoder_num_layers: int = 12,
+ image_encoder_num_heads: int = 12,
+ image_encoder_embedding_dim: int = 256,
+ image_encoder_pooling: str = "cls",
+ image_pooler_num_attn_heads: int = 16,
+ image_pooler_intermediate_size: int = 5504,
+ image_pooler_num_latents: int = 196,
+ image_token_id: int = 32002,
+ initializer_range: float = 0.02,
+ use_cache: bool = True,
+ center_crop: bool = True,
+ **kwargs,
+ ):
+ self.text_decoder_name_or_path = text_decoder_name_or_path
+ self.tokenizer_name_or_path = tokenizer_name_or_path
+
+ self.image_size = image_size
+ self.image_encoder_hidden_size = image_encoder_hidden_size
+ self.image_encoder_patch_size = image_encoder_patch_size
+ self.image_encoder_num_layers = image_encoder_num_layers
+ self.image_encoder_num_heads = image_encoder_num_heads
+ self.image_encoder_embedding_dim = image_encoder_embedding_dim
+ self.image_encoder_pooling = image_encoder_pooling
+
+ self.image_pooler_num_attn_heads = image_pooler_num_attn_heads
+ self.image_pooler_intermediate_size = image_pooler_intermediate_size
+ self.image_pooler_num_latents = image_pooler_num_latents
+
+ self.image_token_id = image_token_id
+
+ self.initializer_range = initializer_range
+ self.use_cache = use_cache
+ self.center_crop = center_crop
+
+ super().__init__(**kwargs)
+
+
+class VLMPreTrainedModel(PreTrainedModel):
+ config_class = VLMConfig
+ base_model_prefix = "vlm"
+ supports_gradient_checkpointing = True
+ _no_split_modules = []
+ _skip_keys_device_placement = "past_key_values"
+
+ def _init_weights(self, module):
+ pass
+
+ def _initialize_weights(self, module):
+ pass
+
+
+class VLMForCausalLM(VLMPreTrainedModel):
+ def __init__(self, config: VLMConfig):
+ super().__init__(config)
+
+ self.config = config
+ self.text_config = AutoConfig.from_pretrained(config.text_decoder_name_or_path)
+ self.text_config.vocab_size += 3
+ self.text_decoder = AutoModelForCausalLM.from_config(self.text_config)
+
+ self.image_encoder = VisualEncoder(
+ self.config.image_encoder_hidden_size,
+ self.config.image_encoder_patch_size,
+ self.config.image_size,
+ self.config.image_encoder_num_layers,
+ self.config.image_encoder_num_heads,
+ self.config.image_encoder_embedding_dim,
+ self.config.image_encoder_pooling,
+ )
+
+ # replace models' layerscales because `transformers` automatically renames keys in state_dict
+ for i in range(len(self.image_encoder.blocks)):
+ self.image_encoder.blocks[i].ls1 = LayerScale(
+ self.image_encoder.blocks[i].ls1.dim,
+ )
+ self.image_encoder.blocks[i].ls2 = LayerScale(
+ self.image_encoder.blocks[i].ls2.dim,
+ )
+
+ self.image_pooler = ImageFeaturesPooler(
+ self.config.image_encoder_hidden_size,
+ self.text_config.hidden_size,
+ self.config.image_pooler_num_attn_heads,
+ self.config.image_pooler_intermediate_size,
+ self.config.image_pooler_num_latents,
+ self.config.initializer_range,
+ )
+
+ def get_input_embeddings(self):
+ return self.text_decoder.get_input_embeddings()
+
+ def set_input_embeddings(self, value):
+ self.text_decoder.set_input_embeddings(value)
+
+ def get_images_embeddings(self, images):
+ features = self.image_encoder.forward_features(images)
+ return self.image_pooler(features)
+
+ def gather_continuous_embeddings(
+ self,
+ input_ids: torch.Tensor,
+ word_embeddings: torch.Tensor,
+ image_embeddings: torch.Tensor,
+ ) -> torch.Tensor:
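+        # Assumes exactly one `image_token_id` placeholder per sample; splice the
+        # pooled image latents into the word embeddings at that position.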
+ start_indices = (input_ids == self.config.image_token_id).nonzero()[:, 1]
+ embeddings = []
+
+ for sample_idx, start_idx in enumerate(start_indices.tolist()):
+ embeddings.append(
+ torch.cat(
+ (
+ word_embeddings[sample_idx, :start_idx],
+ image_embeddings[sample_idx],
+ word_embeddings[sample_idx, start_idx + 1 :],
+ ),
+ dim=0,
+ ),
+ )
+
+ return torch.stack(embeddings, dim=0)
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ images: torch.Tensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[dict, Tuple, CausalLMOutputWithPast]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError(
+ "You cannot specify both input_ids and inputs_embeds at the same time",
+ )
+ elif input_ids is None and inputs_embeds is None:
+ raise ValueError("You have to specify either input_is or inputs_embeds")
+
+ if inputs_embeds is None and past_key_values is None:
+ inputs_embeds = self.get_input_embeddings()(input_ids)
+
+ if images is not None:
+ image_embeds = self.get_images_embeddings(images)
+ inputs_embeds = self.gather_continuous_embeddings(
+ input_ids,
+ inputs_embeds,
+ image_embeds,
+ )
+
+ if position_ids is None:
+ seq_length = inputs_embeds.shape[1] if inputs_embeds is not None else input_ids.shape[1]
+ past_key_values_length = 0
+
+ if past_key_values is not None:
+ past_key_values_length = past_key_values[0][0].shape[2]
+
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+ position_ids = torch.arange(
+ past_key_values_length,
+ seq_length + past_key_values_length,
+ dtype=torch.long,
+ device=device,
+ )
+ position_ids = position_ids.unsqueeze(0)
+
+ outputs = self.text_decoder(
+ inputs_embeds=inputs_embeds,
+ input_ids=input_ids if past_key_values is not None else None,
+ attention_mask=attention_mask,
+ labels=labels,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ use_cache=use_cache,
+ return_dict=return_dict,
+ )
+
+ return outputs
+
+ def prepare_inputs_for_generation(
+ self,
+ input_ids,
+ images=None,
+ past_key_values=None,
+ attention_mask=None,
+ inputs_embeds=None,
+ **kwargs,
+ ):
+ if past_key_values:
+ input_ids = input_ids[:, -1:]
+
+ position_ids = kwargs.get("position_ids", None)
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values:
+ position_ids = position_ids[:, -1].unsqueeze(-1)
+
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {"inputs_embeds": inputs_embeds}
+ else:
+ model_inputs = {"input_ids": input_ids}
+
+ if images is not None:
+ model_inputs["images"] = images
+
+ model_inputs.update(
+ {
+ "position_ids": position_ids,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "attention_mask": attention_mask,
+ "images": images if past_key_values is None else None,
+ },
+ )
+ return model_inputs
+
+ @classmethod
+ def from_config(cls, config, **kwargs):
+ return cls._from_config(config, **kwargs)
+
+
+class VLMProcessor(ProcessorMixin):
+ def __init__(self, config, **kwargs):
+ self.feature_extractor = None
+ self.config = config
+
+ if config.center_crop:
+ self.image_processor = Compose(
+ [
+ Resize(256, interpolation=InterpolationMode.BICUBIC),
+ CenterCrop(config.image_size),
+ convert_to_rgb,
+ ToTensor(),
+ Normalize(
+ mean=IMAGENET_MEAN,
+ std=IMAGENET_STD,
+ ),
+ ],
+ )
+ else:
+ self.image_processor = Compose(
+ [
+ RandomResizedCrop(
+ config.image_size,
+ scale=(0.8, 1),
+ interpolation=InterpolationMode.BICUBIC,
+ ),
+ convert_to_rgb,
+ ToTensor(),
+ Normalize(
+ mean=IMAGENET_MEAN,
+ std=IMAGENET_STD,
+ ),
+ ],
+ )
+
+ self.tokenizer = AutoTokenizer.from_pretrained(
+ config.tokenizer_name_or_path,
+ additional_special_tokens=["<|im_end|>"],
+ )
+ self.num_image_latents = config.image_pooler_num_latents
+
+ def __call__(self, texts=None, images=None, return_tensors="pt", **kwargs):
+ if texts is not None:
+ if isinstance(texts, str):
+ texts = [texts]
+
+ tokenized_texts = []
+ for text in texts:
+ messages = [
+ {"role": "system", "content": "You are a helpful assistant."},
+ {"role": "user", "content": f" {text}"},
+ ]
+ tokenized_prompt = self.tokenizer.apply_chat_template(
+ messages,
+ add_generation_prompt=True,
+ return_tensors=return_tensors,
+ )
+
+ tokenized_texts.append(tokenized_prompt)
+
+ max_len = max(len(t[0]) for t in tokenized_texts)
+ input_ids = torch.full(
+ (len(tokenized_texts), max_len),
+ fill_value=self.tokenizer.pad_token_id,
+ dtype=torch.int64,
+ )
+ attention_mask = torch.full(
+ (len(tokenized_texts), max_len),
+ fill_value=0,
+ dtype=torch.int64,
+ )
+
+ for i, tokens in enumerate(tokenized_texts):
+ input_ids[i, -len(tokens[0]) :] = tokens[0]
+ attention_mask[i, -len(tokens[0]) :] = 1
+
+ attention_mask = F.pad(
+ attention_mask,
+ pad=(0, self.num_image_latents - 1),
+ value=1,
+ )
+
+ encoding = BatchEncoding(
+ data={"input_ids": input_ids, "attention_mask": attention_mask},
+ )
+
+ if images is not None:
+ if isinstance(images, (list, tuple)):
+ image_features = torch.empty(
+ (len(images), 3, self.config.image_size, self.config.image_size),
+ dtype=torch.float32,
+ )
+
+ for i, image in enumerate(images):
+ image_features[i] = self.image_processor(image)
+ else:
+ image_features = self.image_processor(images).unsqueeze(0)
+
+ if texts is not None and images is not None:
+ encoding["images"] = image_features
+ return encoding
+
+ if texts is not None:
+ return encoding
+
+ return BatchEncoding(
+ data={
+ "images": image_features,
+ },
+ tensor_type=return_tensors,
+ )
+
+ def batch_decode(self, *args, **kwargs):
+ return self.tokenizer.batch_decode(*args, **kwargs)
+
+ def decode(self, *args, **kwargs):
+ return self.tokenizer.decode(*args, **kwargs)
+
+ @classmethod
+ def from_pretrained(
+ cls,
+ pretrained_model_name_or_path,
+ cache_dir=None,
+ force_download: bool = False,
+ local_files_only: bool = False,
+ token=None,
+ revision: str = "main",
+ **kwargs,
+ ):
+ config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
+ return cls(config)
+
+
+AutoConfig.register("vlm", VLMConfig)
+AutoModel.register(VLMConfig, VLMForCausalLM)
diff --git a/python/uform/torch_models.py b/python/uform/torch_encoders.py
similarity index 99%
rename from python/uform/torch_models.py
rename to python/uform/torch_encoders.py
index ab86622..4339765 100644
--- a/python/uform/torch_models.py
+++ b/python/uform/torch_encoders.py
@@ -353,7 +353,7 @@ def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor:
return embeddings
-class VLM(nn.Module):
+class TextVisualEncoder(nn.Module):
"""
Vision-Language Model for Multimodal embeddings.
"""
@@ -364,8 +364,9 @@ def __init__(self, config: Dict, tokenizer_path: PathLike):
"""
super().__init__()
- self._embedding_dim = config["text_encoder"]["embedding_dim"]
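+        # Drop tokenizer metadata that the TextEncoder constructor doesn't accept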
+ config["text_encoder"].pop("tokenizer_class", None)
+ self._embedding_dim = config["text_encoder"]["embedding_dim"]
self.text_encoder = TextEncoder(**config["text_encoder"])
self.image_encoder = VisualEncoder(**config["image_encoder"])
@@ -503,3 +504,6 @@ def embedding_dim(self) -> int:
def multimodal_embedding_dim(self) -> int:
"""Dimensionality of multimodal joint embedding."""
return self.text_encoder.dim
+
+
+VLM = TextVisualEncoder # legacy
diff --git a/python/uform/torch_preprocessor.py b/python/uform/torch_processors.py
similarity index 100%
rename from python/uform/torch_preprocessor.py
rename to python/uform/torch_processors.py
diff --git a/swift/EmbeddingsTests.swift b/swift/EmbeddingsTests.swift
index 5efb87f..889cdb6 100644
--- a/swift/EmbeddingsTests.swift
+++ b/swift/EmbeddingsTests.swift
@@ -27,7 +27,7 @@ final class TokenizerTests: XCTestCase {
let api = HubApi(hfToken: "xxx")
let textModel = try await TextEncoder(
- modelName: "unum-cloud/uform-vl2-english-small",
+ modelName: "unum-cloud/uform2-vl-english-small",
hubApi: api
)
@@ -78,11 +78,11 @@ final class TokenizerTests: XCTestCase {
// A better option is to fetch directly from HuggingFace, similar to how users would do that:
let api = HubApi(hfToken: "xxx")
let textModel = try await TextEncoder(
- modelName: "unum-cloud/uform-vl2-english-small",
+ modelName: "unum-cloud/uform2-vl-english-small",
hubApi: api
)
let imageModel = try await ImageEncoder(
- modelName: "unum-cloud/uform-vl2-english-small",
+ modelName: "unum-cloud/uform2-vl-english-small",
hubApi: api
)
diff --git a/swift/README.md b/swift/README.md
new file mode 100644
index 0000000..1eebf29
--- /dev/null
+++ b/swift/README.md
@@ -0,0 +1,44 @@
+# UForm for Swift
+
+UForm offers first-party support for Swift.
+To get started, create a package if you don't have one yet:
+
+```bash
+swift package init --type executable
+```
+
+Then add UForm to the dependencies of your `Package.swift` manifest:
+
+```swift
+dependencies: [
+    .package(url: "https://github.com/unum-cloud/uform.git", branch: "main")
+]
+```
+
+Then, import UForm in your Swift code:
+
+```swift
+import UForm
+```
+
+## Embeddings
+
+### Text Embeddings
+
+```swift
+let textModel = try await TextEncoder(modelName: "unum-cloud/uform2-vl-english-small")
+let text = "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie."
+let textEmbedding: Embedding = try textModel.forward(with: text)
+let textVector: [Float32] = textEmbedding.asFloats()
+```
+
+### Image Embeddings
+
+```swift
+let imageModel = try await ImageEncoder(modelName: "unum-cloud/uform2-vl-english-small")
+let imageURL = "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true"
+guard let url = URL(string: imageURL),
+    let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil),
+    let cgImage = CGImageSourceCreateImageAtIndex(imageSource, 0, nil)
+else {
+    fatalError("Could not load image from URL: \(imageURL)")
+}
+
+let imageEmbedding: Embedding = try imageModel.forward(with: cgImage)
+let imageVector: [Float32] = imageEmbedding.asFloats()
+```
+
+
+### Computing Distances
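+
+Once both embeddings are materialized as `[Float32]`, any vector similarity works.
+Here is a minimal cosine-similarity sketch, with no external dependencies assumed:
+
+```swift
+func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 {
+    precondition(a.count == b.count, "Embeddings must have the same dimensionality")
+    var dot: Float32 = 0, normA: Float32 = 0, normB: Float32 = 0
+    for i in 0..<a.count {
+        dot += a[i] * b[i]
+        normA += a[i] * a[i]
+        normB += b[i] * b[i]
+    }
+    return dot / (normA.squareRoot() * normB.squareRoot())
+}
+
+let similarity = cosineSimilarity(textVector, imageVector)
+```
+
+On Apple platforms, the same dot products can also be computed with `vDSP` from the Accelerate framework.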