diff --git a/.gitignore b/.gitignore
index af7d4af..4db8e17 100755
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,13 @@ test
build/
package-lock.json
*.egg-info
-*.onnx
__pycache__
.build
-.swiftpm
\ No newline at end of file
+.swiftpm
+.hf_token
+node_modules
+
+# Tensors & ML Model
+*.onnx
+*.pt
+*.safetensors
diff --git a/README.md b/README.md
index 031c484..32957e7 100755
--- a/README.md
+++ b/README.md
@@ -20,9 +20,11 @@ For Content Understanding and Generation
Multimodal Embeddings from 64 to 768 Dimensions • 1B Parameter Chat
-Short Texts • Images • 🔜 Video Clips
+Short Texts • Images • 🔜 Video Clips • 🔜 Long Documents
-PyTorch • ONNX
+ONNX • CoreML • PyTorch
+
+Python • JavaScript • Swift
---
@@ -279,7 +281,7 @@ The generative model can be used to caption images, summarize their content, or
The exact behavior is controlled by prompts.
```python
-from uform.gen_model import VLMForCausalLM, VLMProcessor
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor
model = VLMForCausalLM.from_pretrained('unum-cloud/uform-gen')
processor = VLMProcessor.from_pretrained('unum-cloud/uform-gen')
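+
+# A minimal captioning sketch; the prompt wording and generation settings are illustrative
+import torch
+from PIL import Image
+
+prompt = 'Describe the image in great detail.'
+image = Image.open('assets/unum.png')
+inputs = processor(texts=[prompt], images=[image], return_tensors='pt')
+with torch.inference_mode():
+    output = model.generate(
+        **inputs,
+        do_sample=False,
+        max_new_tokens=128,
+        eos_token_id=32001,  # EOS_TOKEN from uform.chat
+        pad_token_id=processor.tokenizer.pad_token_id)
+prompt_len = inputs['input_ids'].shape[1]
+print(processor.batch_decode(output[:, prompt_len:])[0])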
diff --git a/javascript/README.md b/javascript/README.md
new file mode 100644
index 0000000..5626d39
--- /dev/null
+++ b/javascript/README.md
@@ -0,0 +1,10 @@
+# UForm for JavaScript
+
+Pocket-Sized Multimodal AI for Content Understanding and Generation.
+
+Install UForm with the package manager of your choice:
+
+```bash
+pnpm add uform
+npm add uform
+yarn add uform
+```
+
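+UForm for JavaScript relies on `onnxruntime-web` for inference and `@huggingface/hub` for model downloads.
+The bindings are still work-in-progress, so below is only a hypothetical loading sketch; the repository id
+and file names follow the ONNX export notebook and may change:
+
+```js
+import { downloadFile } from "@huggingface/hub";
+import { InferenceSession } from "onnxruntime-web";
+
+// Fetch the fp16 text encoder exported by `python/scripts/export_encoders.ipynb`
+const response = await downloadFile({
+  repo: "unum-cloud/uform2-vl-english-small",
+  path: "text.onnx",
+});
+if (!response) throw new Error("Model file not found on the HuggingFace Hub");
+const session = await InferenceSession.create(await response.arrayBuffer());
+
+// The exported graph takes `input_ids` and `attention_mask`,
+// and produces `features` and `embeddings`
+console.log(session.inputNames, session.outputNames);
+```
+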
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..7331231
--- /dev/null
+++ b/package.json
@@ -0,0 +1,11 @@
+{
+ "name": "uform",
+ "private": true,
+ "version": "2.0.2",
+ "description": "Pocket-Sized Multimodal AI for Content Understanding and Generation",
+ "dependencies": {
+ "@huggingface/hub": "^0.14.8",
+ "@xenova/transformers": "^2.17.0",
+ "onnxruntime-web": "^1.17.3"
+ }
+}
diff --git a/python/scripts/bench.py b/python/scripts/bench.py
index 49c7004..8bcaf37 100644
--- a/python/scripts/bench.py
+++ b/python/scripts/bench.py
@@ -13,7 +13,7 @@
)
from uform import get_model
-from uform.gen_model import VLMForCausalLM, VLMProcessor
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor
dtype = torch.bfloat16
low_cpu_mem_usage = False
diff --git a/python/scripts/export.ipynb b/python/scripts/export_encoders.ipynb
similarity index 56%
rename from python/scripts/export.ipynb
rename to python/scripts/export_encoders.ipynb
index 7afa4cc..df57858 100644
--- a/python/scripts/export.ipynb
+++ b/python/scripts/export_encoders.ipynb
@@ -36,7 +36,7 @@
"import uform\n",
"from PIL import Image\n",
"\n",
- "model, processor = uform.get_model('unum-cloud/' + model_name)\n",
+ "model, processor = uform.get_model('unum-cloud/uform-vl-english-small')\n",
"text = 'a small red panda in a zoo'\n",
"image = Image.open('../../assets/unum.png')\n",
"\n",
@@ -83,66 +83,6 @@
" break # We break after the first layer"
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# PyTorch\n",
- "\n",
- "Let's ensure:\n",
- "\n",
- "- the `model.text_encoder` inputs are called `input_ids` and `attention_mask`, and outputs are `embeddings` and `features`.\n",
- "- the `model.image_encoder` input is called `input`, and outputs are `embeddings` and `features`.\n",
- "- the model itself works fine in `f16` half-precision, so that the model is lighter and easier to download."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "list(name for name, _ in model.text_encoder.named_parameters())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Verify input and output names for text_encoder\n",
- "text_encoder_input_names = [name for name, _ in model.text_encoder.named_parameters()]\n",
- "assert 'input_ids' in text_encoder_input_names, \"input_ids not found in text_encoder inputs\"\n",
- "assert 'attention_mask' in text_encoder_input_names, \"attention_mask not found in text_encoder inputs\"\n",
- "\n",
- "text_encoder_output_names = [name for name, _ in model.text_encoder.named_modules()]\n",
- "assert 'embeddings' in text_encoder_output_names, \"embeddings not found in text_encoder outputs\"\n",
- "assert 'features' in text_encoder_output_names, \"features not found in text_encoder outputs\"\n",
- "\n",
- "# Verify input and output names for image_encoder\n",
- "image_encoder_input_names = [name for name, _ in model.image_encoder.named_parameters()]\n",
- "assert 'input' in image_encoder_input_names, \"input not found in image_encoder inputs\"\n",
- "\n",
- "image_encoder_output_names = [name for name, _ in model.image_encoder.named_modules()]\n",
- "assert 'embeddings' in image_encoder_output_names, \"embeddings not found in image_encoder outputs\"\n",
- "assert 'features' in image_encoder_output_names, \"features not found in image_encoder outputs\"\n",
- "\n",
- "# Ensure the model can be converted to f16 half-precision\n",
- "try:\n",
- " model.half() # Convert to half precision\n",
- " print(\"Model successfully converted to half precision (f16).\")\n",
- "except Exception as e:\n",
- " print(f\"An error occurred while converting the model to half precision: {e}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## ONNX"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -241,12 +181,12 @@
"coreml_model = ct.convert(\n",
" traced_script_module, source=\"pytorch\",\n",
" inputs=[image_input], outputs=[image_features, image_embeddings],\n",
- " convert_to='mlprogram', compute_precision=precision)\n",
+ " convert_to='mlprogram', compute_precision=ct.precision)\n",
"\n",
"coreml_model.author = 'Unum Cloud'\n",
"coreml_model.license = 'Apache 2.0'\n",
"coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
- "coreml_model.save(os.path.join(output_directory, model_name + \"-image.mlpackage\"))"
+ "coreml_model.save(\"../uform-vl-english-small-image.mlpackage\")"
]
},
{
@@ -277,7 +217,256 @@
"coreml_model.author = 'Unum Cloud'\n",
"coreml_model.license = 'Apache 2.0'\n",
"coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
- "coreml_model.save(os.path.join(output_directory, model_name + \"-text.mlpackage\"))"
+ "coreml_model.save(\"../uform-vl-english-small-text.mlpackage\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# PyTorch\n",
+ "\n",
+ "Let's ensure:\n",
+ "\n",
+ "- the `model.text_encoder` inputs are called `input_ids` and `attention_mask`, and outputs are `embeddings` and `features`.\n",
+ "- the `model.image_encoder` input is called `input`, and outputs are `embeddings` and `features`.\n",
+ "- the model itself works fine in `f16` half-precision, so that the model is lighter and easier to download."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from safetensors import safe_open\n",
+ "from safetensors.torch import save_file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.image_encoder.eval()\n",
+ "model.image_encoder.to(dtype=torch.bfloat16)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "torch.save(model.image_encoder.state_dict(), 'image.pt')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "save_file(model.image_encoder.state_dict(), \"image.safetensors\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.text_encoder.eval()\n",
+ "model.text_encoder.to(dtype=torch.bfloat16)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "torch.save(model.text_encoder.state_dict(), 'text.pt')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "save_file(model.text_encoder.state_dict(), \"text.safetensors\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image_features, image_embedding = model.encode_image(image_data.to(dtype=torch.bfloat16), return_features=True)\n",
+ "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n",
+ "\n",
+ "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.safetensors image.safetensors\n",
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.safetensors text.safetensors"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.pt image.pt\n",
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.pt text.pt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## ONNX"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install onnx onnxconverter-common"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from torch.onnx import export as onnx_export"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can't immediately export to `bfloat16` as it's not supported by ONNX, but we also can't export to `float16`, as the forward pass (that will be traced) is gonna fail. So let's export to `float32` ONNX file first."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module = model.text_encoder\n",
+ "module.eval()\n",
+ "module.return_features = True\n",
+ "module.to(dtype=torch.float32)\n",
+ "\n",
+ "onnx_export(\n",
+ " module,\n",
+ " (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n",
+ " \"text.onnx\", \n",
+ " export_params=True,\n",
+ " opset_version=15,\n",
+ " do_constant_folding=True,\n",
+ " input_names = ['input_ids', 'attention_mask'], \n",
+ " output_names = ['features', 'embeddings'],\n",
+ " dynamic_axes={\n",
+ " 'input_ids' : {0 : 'batch_size'}, \n",
+ " 'attention_mask' : {0 : 'batch_size'}, \n",
+ " 'features' : {0 : 'batch_size'}, \n",
+ " 'embeddings' : {0 : 'batch_size'}})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import onnx\n",
+ "from onnxconverter_common import float16\n",
+ "\n",
+ "module = onnx.load(\"text.onnx\")\n",
+ "module_fp16 = float16.convert_float_to_float16(module)\n",
+ "onnx.save(module_fp16, \"text.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now repeat the same for images."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module = model.image_encoder\n",
+ "module.eval()\n",
+ "module.return_features = True\n",
+ "module.to(dtype=torch.float32)\n",
+ "\n",
+ "torch.onnx.export(\n",
+ " module,\n",
+ " image_data, \n",
+ " \"image.onnx\", \n",
+ " export_params=True,\n",
+ " opset_version=15,\n",
+ " do_constant_folding=True,\n",
+ " input_names = ['input'], \n",
+ " output_names = ['features', 'embeddings'],\n",
+ " dynamic_axes={\n",
+ " 'input' : {0 : 'batch_size'},\n",
+ " 'features' : {0 : 'batch_size'},\n",
+ " 'embeddings' : {0 : 'batch_size'}})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import onnx\n",
+ "from onnxconverter_common import float16\n",
+ "\n",
+ "module = onnx.load(\"image.onnx\")\n",
+ "module_fp16 = float16.convert_float_to_float16(module)\n",
+ "onnx.save(module_fp16, \"image.onnx\")"
+ ]
+ },
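+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before uploading, we can sanity-check the exported `f16` ONNX files with `onnxruntime`, assuming `image_data` from the earlier cells is still around. The input and output names match the ones we passed to the exporter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import onnxruntime as ort\n",
+    "\n",
+    "session = ort.InferenceSession(\"image.onnx\", providers=[\"CPUExecutionProvider\"])\n",
+    "features, embeddings = session.run(None, {\"input\": image_data.numpy().astype(np.float16)})\n",
+    "features.shape, embeddings.shape"
+   ]
+  },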
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.onnx image.onnx\n",
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.onnx text.onnx"
]
}
],
@@ -297,7 +486,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.11"
+ "version": "3.11.5"
}
},
"nbformat": 4,
diff --git a/python/scripts/test_generative.py b/python/scripts/test_decoders.py
similarity index 100%
rename from python/scripts/test_generative.py
rename to python/scripts/test_decoders.py
diff --git a/python/scripts/test_embeddings.py b/python/scripts/test_encoders.py
similarity index 87%
rename from python/scripts/test_embeddings.py
rename to python/scripts/test_encoders.py
index d71bf0b..e7541c1 100644
--- a/python/scripts/test_embeddings.py
+++ b/python/scripts/test_encoders.py
@@ -1,4 +1,5 @@
from typing import Tuple
+import os
import pytest
from PIL import Image
@@ -21,6 +22,7 @@
onnx_available = False
torch_models = [
+ "unum-cloud/uform2-vl-english-small",
"unum-cloud/uform-vl-english",
"unum-cloud/uform-vl-multilingual-v2",
]
@@ -34,11 +36,20 @@
("unum-cloud/uform-vl-english-large", "gpu", "fp16"),
]
+# Let's check if a HuggingFace Hub API token is set in the environment.
+# If not, fall back to a `.hf_token` file in the current working directory.
+token = os.getenv("HUGGINGFACE_HUB_TOKEN", None)
+if token is None:
+ token_path = "./.hf_token"
+ if os.path.exists(token_path):
+ with open(token_path, "r") as file:
+ token = file.read().strip()
+
@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
@pytest.mark.parametrize("model_name", torch_models)
def test_torch_one_embedding(model_name: str):
- model, processor = uform.get_model(model_name)
+ model, processor = uform.get_model(model_name, token=token)
text = "a small red panda in a zoo"
image_path = "assets/unum.png"
@@ -67,7 +78,7 @@ def test_torch_one_embedding(model_name: str):
@pytest.mark.parametrize("model_name", torch_models)
@pytest.mark.parametrize("batch_size", [1, 2])
def test_torch_many_embeddings(model_name: str, batch_size: int):
- model, processor = uform.get_model(model_name)
+ model, processor = uform.get_model(model_name, token=token)
texts = ["a small red panda in a zoo"] * batch_size
image_paths = ["assets/unum.png"] * batch_size
@@ -90,7 +101,7 @@ def test_onnx_one_embedding(model_specs: Tuple[str, str, str]):
try:
- model, processor = uform.get_model_onnx(*model_specs)
+ model, processor = uform.get_model_onnx(*model_specs, token=token)
text = "a small red panda in a zoo"
image_path = "assets/unum.png"
@@ -126,7 +137,7 @@ def test_onnx_many_embeddings(model_specs: Tuple[str, str, str], batch_size: int
try:
- model, processor = uform.get_model_onnx(*model_specs)
+ model, processor = uform.get_model_onnx(*model_specs, token=token)
texts = ["a small red panda in a zoo"] * batch_size
image_paths = ["assets/unum.png"] * batch_size
diff --git a/python/uform/__init__.py b/python/uform/__init__.py
index 1ecb242..f5a15c2 100755
--- a/python/uform/__init__.py
+++ b/python/uform/__init__.py
@@ -1,40 +1,90 @@
from json import load
-from os.path import join
+from os.path import join, exists
-from typing import Mapping, Optional, Tuple
+from typing import Mapping, Optional, Tuple, Union
+from enum import Enum
from huggingface_hub import snapshot_download
-def get_checkpoint(model_name: str, token: str) -> Tuple[str, Mapping, str]:
- import torch
-
- model_path = snapshot_download(repo_id=model_name, token=token)
- config_path = join(model_path, "torch_config.json")
+class Modality(Enum):
+ TEXT_ENCODER = "text_encoder"
+ IMAGE_ENCODER = "image_encoder"
- state = torch.load(join(model_path, "torch_weight.pt"))
- return config_path, state, join(model_path, "tokenizer.json")
+def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[Union[str, Modality], ...]) -> Tuple[str, Mapping, str]:
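+    """Resolve the config, tokenizer, and checkpoint paths for a model hosted on the HuggingFace Hub,
+    downloading only the files needed for the requested modalities.
+
+    :param model_name: HuggingFace Hub repository id
+    :param token: optional HuggingFace Hub token, for private repositories
+    :param modalities: encoders to fetch when only per-modality checkpoints are published
+    """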
+ import torch
-def get_model(model_name: str, token: Optional[str] = None):
- from uform.torch_models import VLM
- from uform.torch_preprocessor import TorchProcessor
-
- config_path, state, tokenizer_path = get_checkpoint(model_name, token)
+ # It is not recommended to use `.pth` extension when checkpointing models
+ # because it collides with Python path (`.pth`) configuration files.
+ merged_model_names = ["torch_weight.pt", "weight.pt", "model.pt"]
+    # The per-modality checkpoints are exported as `text.pt` and `image.pt`
+    modality_stems = [(x.value if isinstance(x, Modality) else x).removesuffix("_encoder") for x in modalities]
+    separate_modality_names = [stem + ".pt" for stem in modality_stems]
+ config_names = ["torch_config.json", "config.json"]
+ tokenizer_names = ["tokenizer.json"]
+
+ # The download stats depend on the number of times the `config.json` is pulled
+ # https://huggingface.co/docs/hub/models-download-stats
+ model_path = snapshot_download(
+ repo_id=model_name,
+ token=token,
+ allow_patterns=merged_model_names + separate_modality_names + config_names + tokenizer_names,
+ )
+
+ # Find the first name in `config_names` that is present
+ config_path = None
+ for config_name in config_names:
+ if exists(join(model_path, config_name)):
+ config_path = join(model_path, config_name)
+ break
+
+ # Same for the tokenizer
+ tokenizer_path = None
+ for tokenizer_name in tokenizer_names:
+ if exists(join(model_path, tokenizer_name)):
+ tokenizer_path = join(model_path, tokenizer_name)
+ break
+
+    # Ideally, we would fetch the single merged checkpoint.
+    # If it isn't available, load the per-modality checkpoints and merge them.
+ state = None
+ for file_name in merged_model_names:
+ if exists(join(model_path, file_name)):
+ state = torch.load(join(model_path, file_name))
+ break
+
+ if state is None:
+ state = {}
+ for file_name in separate_modality_names:
+ if exists(join(model_path, file_name)):
+ modality_name, _, _ = file_name.partition(".")
+ property_name = modality_name + "_encoder"
+ state[property_name] = torch.load(join(model_path, file_name))
+
+ return config_path, state, tokenizer_path
+
+
+def get_model(model_name: str, token: Optional[str] = None, modalities: Optional[Tuple[Union[str, Modality], ...]] = None):
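+    """Load a pre-trained UForm encoder model and its pre-processor.
+
+    :param model_name: HuggingFace Hub repository id, like `unum-cloud/uform-vl-english`
+    :param token: optional HuggingFace Hub token, for private repositories
+    :param modalities: encoders to load; defaults to both text and image
+    """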
+    from uform.torch_encoders import TextVisualEncoder
+    from uform.torch_processors import TorchProcessor
+
+ if modalities is None:
+        modalities = (Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER)
+
+ config_path, state, tokenizer_path = get_checkpoint(model_name, token, modalities)
with open(config_path) as f:
config = load(f)
- model = VLM(config, tokenizer_path)
- model.image_encoder.load_state_dict(state["image_encoder"])
- model.text_encoder.load_state_dict(state["text_encoder"])
+ model = TextVisualEncoder(config, tokenizer_path)
+    if "image_encoder" in state:
+        model.image_encoder.load_state_dict(state["image_encoder"])
+    if "text_encoder" in state:
+        model.text_encoder.load_state_dict(state["text_encoder"])
processor = TorchProcessor(config, tokenizer_path)
return model.eval(), processor
def get_model_onnx(model_name: str, device: str, dtype: str, token: Optional[str] = None):
- from uform.onnx_models import VLM_ONNX
- from uform.numpy_preprocessor import NumPyProcessor
+    from uform.onnx_encoders import TextVisualEncoder
+    from uform.numpy_processors import NumPyProcessor
assert device in (
"cpu",
@@ -53,7 +103,7 @@ def get_model_onnx(model_name: str, device: str, dtype: str, token: Optional[str
with open(join(model_path, "config.json")) as f:
config = load(f)
- model = VLM_ONNX(model_path, config, device, dtype)
+ model = TextVisualEncoder(model_path, config, device, dtype)
processor = NumPyProcessor(config, join(model_path, "tokenizer.json"))
return model, processor
diff --git a/python/uform/chat.py b/python/uform/chat.py
index 5ef44b7..c9f8dc3 100644
--- a/python/uform/chat.py
+++ b/python/uform/chat.py
@@ -5,7 +5,7 @@
from PIL import Image
from transformers import TextStreamer
-from uform.gen_model import VLMForCausalLM, VLMProcessor
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor
EOS_TOKEN = 32001
diff --git a/python/uform/gen_model.py b/python/uform/gen_model.py
index c03b6eb..6792120 100644
--- a/python/uform/gen_model.py
+++ b/python/uform/gen_model.py
@@ -1,464 +1 @@
-from typing import List, Optional, Tuple, Union
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-from torchvision.transforms import (CenterCrop, Compose, InterpolationMode,
- Normalize, RandomResizedCrop, Resize,
- ToTensor)
-from transformers import AutoConfig, AutoTokenizer
-from transformers.configuration_utils import PretrainedConfig
-from transformers.modeling_outputs import CausalLMOutputWithPast
-from transformers.modeling_utils import PreTrainedModel
-from transformers.models.auto.modeling_auto import (AutoModel,
- AutoModelForCausalLM)
-from transformers.processing_utils import ProcessorMixin
-from transformers.tokenization_utils_base import BatchEncoding
-
-from uform.torch_models import VisualEncoder
-
-IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073)
-IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711)
-
-
-def convert_to_rgb(image):
- return image.convert("RGB")
-
-
-class LayerScale(nn.Module):
- def __init__(self, dim, init_values: float = 1e-5, inplace: bool = False):
- super().__init__()
- self.weight = nn.Parameter(init_values * torch.ones(dim))
- self.inplace = inplace
-
- def forward(self, x):
- return x.mul_(self.weight) if self.inplace else x * self.weight
-
-
-class ImageFeaturesPooler(nn.Module):
- def __init__(
- self,
- input_size,
- hidden_size,
- num_attn_heads,
- intermediate_size,
- num_latents,
- initializer_range,
- ):
- super().__init__()
- self.projection = nn.Linear(input_size, hidden_size)
-
- self.pooler = nn.TransformerDecoderLayer(
- hidden_size,
- num_attn_heads,
- intermediate_size,
- activation=nn.functional.silu,
- batch_first=True,
- norm_first=True,
- )
- self.image_latents = nn.Parameter(
- torch.randn(1, num_latents, hidden_size) * initializer_range**0.5,
- )
-
- def forward(self, features):
- features = self.projection(features)
- return self.pooler(
- self.image_latents.expand(features.shape[0], -1, -1),
- features,
- )
-
-
-class VLMConfig(PretrainedConfig):
- model_type = "vlm"
-
- def __init__(
- self,
- text_decoder_name_or_path: str = "",
- tokenizer_name_or_path: str = "",
- image_size: int = 224,
- image_encoder_hidden_size: int = 768,
- image_encoder_patch_size: int = 16,
- image_encoder_num_layers: int = 12,
- image_encoder_num_heads: int = 12,
- image_encoder_embedding_dim: int = 256,
- image_encoder_pooling: str = "cls",
- image_pooler_num_attn_heads: int = 16,
- image_pooler_intermediate_size: int = 5504,
- image_pooler_num_latents: int = 196,
- image_token_id: int = 32002,
- initializer_range: float = 0.02,
- use_cache: bool = True,
- center_crop: bool = True,
- **kwargs,
- ):
- self.text_decoder_name_or_path = text_decoder_name_or_path
- self.tokenizer_name_or_path = tokenizer_name_or_path
-
- self.image_size = image_size
- self.image_encoder_hidden_size = image_encoder_hidden_size
- self.image_encoder_patch_size = image_encoder_patch_size
- self.image_encoder_num_layers = image_encoder_num_layers
- self.image_encoder_num_heads = image_encoder_num_heads
- self.image_encoder_embedding_dim = image_encoder_embedding_dim
- self.image_encoder_pooling = image_encoder_pooling
-
- self.image_pooler_num_attn_heads = image_pooler_num_attn_heads
- self.image_pooler_intermediate_size = image_pooler_intermediate_size
- self.image_pooler_num_latents = image_pooler_num_latents
-
- self.image_token_id = image_token_id
-
- self.initializer_range = initializer_range
- self.use_cache = use_cache
- self.center_crop = center_crop
-
- super().__init__(**kwargs)
-
-
-class VLMPreTrainedModel(PreTrainedModel):
- config_class = VLMConfig
- base_model_prefix = "vlm"
- supports_gradient_checkpointing = True
- _no_split_modules = []
- _skip_keys_device_placement = "past_key_values"
-
- def _init_weights(self, module):
- pass
-
- def _initialize_weights(self, module):
- pass
-
-
-class VLMForCausalLM(VLMPreTrainedModel):
- def __init__(self, config: VLMConfig):
- super().__init__(config)
-
- self.config = config
- self.text_config = AutoConfig.from_pretrained(config.text_decoder_name_or_path)
- self.text_config.vocab_size += 3
- self.text_decoder = AutoModelForCausalLM.from_config(self.text_config)
-
- self.image_encoder = VisualEncoder(
- self.config.image_encoder_hidden_size,
- self.config.image_encoder_patch_size,
- self.config.image_size,
- self.config.image_encoder_num_layers,
- self.config.image_encoder_num_heads,
- self.config.image_encoder_embedding_dim,
- self.config.image_encoder_pooling,
- )
-
- # replace models' layerscales because `transformers` automatically renames keys in state_dict
- for i in range(len(self.image_encoder.blocks)):
- self.image_encoder.blocks[i].ls1 = LayerScale(
- self.image_encoder.blocks[i].ls1.dim,
- )
- self.image_encoder.blocks[i].ls2 = LayerScale(
- self.image_encoder.blocks[i].ls2.dim,
- )
-
- self.image_pooler = ImageFeaturesPooler(
- self.config.image_encoder_hidden_size,
- self.text_config.hidden_size,
- self.config.image_pooler_num_attn_heads,
- self.config.image_pooler_intermediate_size,
- self.config.image_pooler_num_latents,
- self.config.initializer_range,
- )
-
- def get_input_embeddings(self):
- return self.text_decoder.get_input_embeddings()
-
- def set_input_embeddings(self, value):
- self.text_decoder.set_input_embeddings(value)
-
- def get_images_embeddings(self, images):
- features = self.image_encoder.forward_features(images)
- return self.image_pooler(features)
-
- def gather_continuous_embeddings(
- self,
- input_ids: torch.Tensor,
- word_embeddings: torch.Tensor,
- image_embeddings: torch.Tensor,
- ) -> torch.Tensor:
- start_indices = (input_ids == self.config.image_token_id).nonzero()[:, 1]
- embeddings = []
-
- for sample_idx, start_idx in enumerate(start_indices.tolist()):
- embeddings.append(
- torch.cat(
- (
- word_embeddings[sample_idx, :start_idx],
- image_embeddings[sample_idx],
- word_embeddings[sample_idx, start_idx + 1 :],
- ),
- dim=0,
- ),
- )
-
- return torch.stack(embeddings, dim=0)
-
- def forward(
- self,
- input_ids: torch.LongTensor = None,
- images: torch.Tensor = None,
- attention_mask: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.LongTensor] = None,
- past_key_values: Optional[List[torch.FloatTensor]] = None,
- inputs_embeds: Optional[torch.FloatTensor] = None,
- use_cache: Optional[bool] = None,
- labels: Optional[torch.Tensor] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- ) -> Union[dict, Tuple, CausalLMOutputWithPast]:
- output_attentions = (
- output_attentions
- if output_attentions is not None
- else self.config.output_attentions
- )
- output_hidden_states = (
- output_hidden_states
- if output_hidden_states is not None
- else self.config.output_hidden_states
- )
- use_cache = use_cache if use_cache is not None else self.config.use_cache
-
- return_dict = (
- return_dict if return_dict is not None else self.config.use_return_dict
- )
-
- if input_ids is not None and inputs_embeds is not None:
- raise ValueError(
- "You cannot specify both input_ids and inputs_embeds at the same time",
- )
- elif input_ids is None and inputs_embeds is None:
- raise ValueError("You have to specify either input_is or inputs_embeds")
-
- if inputs_embeds is None and past_key_values is None:
- inputs_embeds = self.get_input_embeddings()(input_ids)
-
- if images is not None:
- image_embeds = self.get_images_embeddings(images)
- inputs_embeds = self.gather_continuous_embeddings(
- input_ids,
- inputs_embeds,
- image_embeds,
- )
-
- if position_ids is None:
- seq_length = (
- inputs_embeds.shape[1]
- if inputs_embeds is not None
- else input_ids.shape[1]
- )
- past_key_values_length = 0
-
- if past_key_values is not None:
- past_key_values_length = past_key_values[0][0].shape[2]
-
- device = input_ids.device if input_ids is not None else inputs_embeds.device
- position_ids = torch.arange(
- past_key_values_length,
- seq_length + past_key_values_length,
- dtype=torch.long,
- device=device,
- )
- position_ids = position_ids.unsqueeze(0)
-
- outputs = self.text_decoder(
- inputs_embeds=inputs_embeds,
- input_ids=input_ids if past_key_values is not None else None,
- attention_mask=attention_mask,
- labels=labels,
- position_ids=position_ids,
- past_key_values=past_key_values,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- use_cache=use_cache,
- return_dict=return_dict,
- )
-
- return outputs
-
- def prepare_inputs_for_generation(
- self,
- input_ids,
- images=None,
- past_key_values=None,
- attention_mask=None,
- inputs_embeds=None,
- **kwargs,
- ):
- if past_key_values:
- input_ids = input_ids[:, -1:]
-
- position_ids = kwargs.get("position_ids", None)
- if attention_mask is not None and position_ids is None:
- # create position_ids on the fly for batch generation
- position_ids = attention_mask.long().cumsum(-1) - 1
- position_ids.masked_fill_(attention_mask == 0, 1)
- if past_key_values:
- position_ids = position_ids[:, -1].unsqueeze(-1)
-
- # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
- if inputs_embeds is not None and past_key_values is None:
- model_inputs = {"inputs_embeds": inputs_embeds}
- else:
- model_inputs = {"input_ids": input_ids}
-
- if images is not None:
- model_inputs["images"] = images
-
- model_inputs.update(
- {
- "position_ids": position_ids,
- "past_key_values": past_key_values,
- "use_cache": kwargs.get("use_cache"),
- "attention_mask": attention_mask,
- "images": images if past_key_values is None else None,
- },
- )
- return model_inputs
-
- @classmethod
- def from_config(cls, config, **kwargs):
- return cls._from_config(config, **kwargs)
-
-
-class VLMProcessor(ProcessorMixin):
- def __init__(self, config, **kwargs):
- self.feature_extractor = None
- self.config = config
-
- if config.center_crop:
- self.image_processor = Compose(
- [
- Resize(256, interpolation=InterpolationMode.BICUBIC),
- CenterCrop(config.image_size),
- convert_to_rgb,
- ToTensor(),
- Normalize(
- mean=IMAGENET_MEAN,
- std=IMAGENET_STD,
- ),
- ],
- )
- else:
- self.image_processor = Compose(
- [
- RandomResizedCrop(
- config.image_size,
- scale=(0.8, 1),
- interpolation=InterpolationMode.BICUBIC,
- ),
- convert_to_rgb,
- ToTensor(),
- Normalize(
- mean=IMAGENET_MEAN,
- std=IMAGENET_STD,
- ),
- ],
- )
-
- self.tokenizer = AutoTokenizer.from_pretrained(
- config.tokenizer_name_or_path,
- additional_special_tokens=["<|im_end|>"],
- )
- self.num_image_latents = config.image_pooler_num_latents
-
- def __call__(self, texts=None, images=None, return_tensors="pt", **kwargs):
- if texts is not None:
- if isinstance(texts, str):
- texts = [texts]
-
- tokenized_texts = []
- for text in texts:
- messages = [
- {"role": "system", "content": "You are a helpful assistant."},
- {"role": "user", "content": f" {text}"},
- ]
- tokenized_prompt = self.tokenizer.apply_chat_template(
- messages,
- add_generation_prompt=True,
- return_tensors=return_tensors,
- )
-
- tokenized_texts.append(tokenized_prompt)
-
- max_len = max(len(t[0]) for t in tokenized_texts)
- input_ids = torch.full(
- (len(tokenized_texts), max_len),
- fill_value=self.tokenizer.pad_token_id,
- dtype=torch.int64,
- )
- attention_mask = torch.full(
- (len(tokenized_texts), max_len),
- fill_value=0,
- dtype=torch.int64,
- )
-
- for i, tokens in enumerate(tokenized_texts):
- input_ids[i, -len(tokens[0]) :] = tokens[0]
- attention_mask[i, -len(tokens[0]) :] = 1
-
- attention_mask = F.pad(
- attention_mask,
- pad=(0, self.num_image_latents - 1),
- value=1,
- )
-
- encoding = BatchEncoding(
- data={"input_ids": input_ids, "attention_mask": attention_mask},
- )
-
- if images is not None:
- if isinstance(images, (list, tuple)):
- image_features = torch.empty(
- (len(images), 3, self.config.image_size, self.config.image_size),
- dtype=torch.float32,
- )
-
- for i, image in enumerate(images):
- image_features[i] = self.image_processor(image)
- else:
- image_features = self.image_processor(images).unsqueeze(0)
-
- if texts is not None and images is not None:
- encoding["images"] = image_features
- return encoding
-
- if texts is not None:
- return encoding
-
- return BatchEncoding(
- data={
- "images": image_features,
- },
- tensor_type=return_tensors,
- )
-
- def batch_decode(self, *args, **kwargs):
- return self.tokenizer.batch_decode(*args, **kwargs)
-
- def decode(self, *args, **kwargs):
- return self.tokenizer.decode(*args, **kwargs)
-
- @classmethod
- def from_pretrained(
- cls,
- pretrained_model_name_or_path,
- cache_dir=None,
- force_download: bool = False,
- local_files_only: bool = False,
- token=None,
- revision: str = "main",
- **kwargs,
- ):
- config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
- return cls(config)
-
-
-AutoConfig.register("vlm", VLMConfig)
-AutoModel.register(VLMConfig, VLMForCausalLM)
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor # legacy path
diff --git a/python/uform/numpy_preprocessor.py b/python/uform/numpy_processors.py
similarity index 100%
rename from python/uform/numpy_preprocessor.py
rename to python/uform/numpy_processors.py
diff --git a/python/uform/onnx_models.py b/python/uform/onnx_encoders.py
similarity index 97%
rename from python/uform/onnx_models.py
rename to python/uform/onnx_encoders.py
index 8e2a87a..68255de 100644
--- a/python/uform/onnx_models.py
+++ b/python/uform/onnx_encoders.py
@@ -23,7 +23,7 @@ def available_providers(device: str) -> Tuple[str, ...]:
return cpu_providers
-class VisualEncoderONNX:
+class VisualEncoder:
def __init__(self, model_path: str, device: str):
"""
:param model_path: Path to onnx model
@@ -43,7 +43,7 @@ def __call__(self, images: ndarray) -> Tuple[ndarray, ndarray]:
return self.session.run(None, {"images": images})
-class TextEncoderONNX:
+class TextEncoder:
def __init__(self, text_encoder_path: str, reranker_path: str, device: str):
"""
:param text_encoder_path: Path to onnx of text encoder
@@ -82,7 +82,7 @@ def forward_multimodal(
)
-class VLM_ONNX:
+class TextVisualEncoder:
def __init__(self, checkpoint_path: str, config: Dict, device: str, dtype: str):
assert device in (
"cpu",
@@ -103,13 +103,13 @@ def __init__(self, checkpoint_path: str, config: Dict, device: str, dtype: str):
self._text_encoder_dim = config["text_encoder"]["dim"]
self._image_encoder_dim = config["image_encoder"]["dim"]
- self.text_encoder = TextEncoderONNX(
+ self.text_encoder = TextEncoder(
join(checkpoint_path, f"text_encoder.onnx"),
join(checkpoint_path, f"reranker.onnx"),
device,
)
- self.image_encoder = VisualEncoderONNX(join(checkpoint_path, f"image_encoder.onnx"), device)
+ self.image_encoder = VisualEncoder(join(checkpoint_path, f"image_encoder.onnx"), device)
def encode_image(
self,
@@ -229,3 +229,6 @@ def embedding_dim(self) -> int:
def multimodal_embedding_dim(self) -> int:
"""Dimensionality of multimodal joint embedding."""
return self._text_encoder_dim
+
+
+VLM_ONNX = TextVisualEncoder # legacy
diff --git a/python/uform/preprocessing.py b/python/uform/preprocessing.py
deleted file mode 100644
index d3d833e..0000000
--- a/python/uform/preprocessing.py
+++ /dev/null
@@ -1,105 +0,0 @@
-from os import PathLike
-from typing import Dict, List, Union
-
-import torch
-from PIL import Image
-from tokenizers import Tokenizer
-from torch import Tensor
-from torchvision.transforms import (CenterCrop, Compose, InterpolationMode,
- Normalize, Resize, ToTensor)
-
-
-# lambda is not pickable
-def convert_to_rgb(image):
- return image.convert("RGB")
-
-
-class Processor:
- def __init__(self, config: Dict, tokenizer_path: PathLike, tensor_type: str = "pt"):
- """
- :param config: model config
- :param tokenizer_path: path to tokenizer file
- :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy)
- """
-
- assert tensor_type in ("pt", "np"), "`tensor_type` must be either `pt` or `np`"
-
- self._image_size = config["image_encoder"]["image_size"]
- self._max_seq_len = config["text_encoder"]["max_position_embeddings"]
- self._tokenizer = Tokenizer.from_file(tokenizer_path)
- self._tokenizer.no_padding()
- self._pad_token_idx = config["text_encoder"]["padding_idx"]
-
- self.tensor_type = tensor_type
-
- self._image_transform = Compose(
- [
- Resize(self._image_size, interpolation=InterpolationMode.BICUBIC),
- convert_to_rgb,
- CenterCrop(self._image_size),
- ToTensor(),
- Normalize(
- mean=(0.48145466, 0.4578275, 0.40821073),
- std=(0.26862954, 0.26130258, 0.27577711),
- ),
- ],
- )
-
- def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]:
- """Transforms one or more strings into dictionary with tokenized strings and attention masks.
-
- :param texts: text of list of texts to tokenizer
- """
- if isinstance(texts, str):
- texts = [texts]
-
- input_ids = torch.full(
- (len(texts), self._max_seq_len),
- fill_value=self._pad_token_idx,
- dtype=torch.int64,
- )
-
- attention_mask = torch.zeros(
- len(texts),
- self._max_seq_len,
- dtype=torch.int32,
- )
- encoded = self._tokenizer.encode_batch(texts)
-
- for i, seq in enumerate(encoded):
- seq_len = min(len(seq), self._max_seq_len)
- input_ids[i, :seq_len] = torch.LongTensor(
- seq.ids[: self._max_seq_len],
- )
- attention_mask[i, :seq_len] = 1
-
- if self.tensor_type == "np":
- return {
- "input_ids": input_ids.numpy(),
- "attention_mask": attention_mask.numpy(),
- }
-
- return {"input_ids": input_ids, "attention_mask": attention_mask}
-
- def preprocess_image(self, images: Union[Image, List[Image]]) -> Tensor:
- """Transforms one or more Pillow images into Torch Tensors.
-
- :param images: image or list of images to preprocess
- """
-
- if isinstance(images, list):
- batch_images = torch.empty(
- (len(images), 3, self._image_size, self._image_size),
- dtype=torch.float32,
- )
-
- for i, image in enumerate(images):
- batch_images[i] = self._image_transform(image)
-
- else:
- batch_images = self._image_transform(images).unsqueeze(0)
-
- if self.tensor_type == "np":
- return batch_images.numpy()
-
- return batch_images
diff --git a/python/uform/torch_decoders.py b/python/uform/torch_decoders.py
new file mode 100644
index 0000000..79b058d
--- /dev/null
+++ b/python/uform/torch_decoders.py
@@ -0,0 +1,457 @@
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torchvision.transforms import (
+ CenterCrop,
+ Compose,
+ InterpolationMode,
+ Normalize,
+ RandomResizedCrop,
+ Resize,
+ ToTensor,
+)
+from transformers import AutoConfig, AutoTokenizer
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.auto.modeling_auto import AutoModel, AutoModelForCausalLM
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import BatchEncoding
+
+from uform.torch_encoders import VisualEncoder
+
+IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+
+def convert_to_rgb(image):
+ return image.convert("RGB")
+
+
+class LayerScale(nn.Module):
+ def __init__(self, dim, init_values: float = 1e-5, inplace: bool = False):
+ super().__init__()
+ self.weight = nn.Parameter(init_values * torch.ones(dim))
+ self.inplace = inplace
+
+ def forward(self, x):
+ return x.mul_(self.weight) if self.inplace else x * self.weight
+
+
+class ImageFeaturesPooler(nn.Module):
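+    """Pools a variable number of image patch features into a fixed number of latent
+    tokens, using a single cross-attention decoder layer with learned latent queries."""
+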
+ def __init__(
+ self,
+ input_size,
+ hidden_size,
+ num_attn_heads,
+ intermediate_size,
+ num_latents,
+ initializer_range,
+ ):
+ super().__init__()
+ self.projection = nn.Linear(input_size, hidden_size)
+
+ self.pooler = nn.TransformerDecoderLayer(
+ hidden_size,
+ num_attn_heads,
+ intermediate_size,
+ activation=nn.functional.silu,
+ batch_first=True,
+ norm_first=True,
+ )
+ self.image_latents = nn.Parameter(
+ torch.randn(1, num_latents, hidden_size) * initializer_range**0.5,
+ )
+
+ def forward(self, features):
+ features = self.projection(features)
+ return self.pooler(
+ self.image_latents.expand(features.shape[0], -1, -1),
+ features,
+ )
+
+
+class VLMConfig(PretrainedConfig):
+ model_type = "vlm"
+
+ def __init__(
+ self,
+ text_decoder_name_or_path: str = "",
+ tokenizer_name_or_path: str = "",
+ image_size: int = 224,
+ image_encoder_hidden_size: int = 768,
+ image_encoder_patch_size: int = 16,
+ image_encoder_num_layers: int = 12,
+ image_encoder_num_heads: int = 12,
+ image_encoder_embedding_dim: int = 256,
+ image_encoder_pooling: str = "cls",
+ image_pooler_num_attn_heads: int = 16,
+ image_pooler_intermediate_size: int = 5504,
+ image_pooler_num_latents: int = 196,
+ image_token_id: int = 32002,
+ initializer_range: float = 0.02,
+ use_cache: bool = True,
+ center_crop: bool = True,
+ **kwargs,
+ ):
+ self.text_decoder_name_or_path = text_decoder_name_or_path
+ self.tokenizer_name_or_path = tokenizer_name_or_path
+
+ self.image_size = image_size
+ self.image_encoder_hidden_size = image_encoder_hidden_size
+ self.image_encoder_patch_size = image_encoder_patch_size
+ self.image_encoder_num_layers = image_encoder_num_layers
+ self.image_encoder_num_heads = image_encoder_num_heads
+ self.image_encoder_embedding_dim = image_encoder_embedding_dim
+ self.image_encoder_pooling = image_encoder_pooling
+
+ self.image_pooler_num_attn_heads = image_pooler_num_attn_heads
+ self.image_pooler_intermediate_size = image_pooler_intermediate_size
+ self.image_pooler_num_latents = image_pooler_num_latents
+
+ self.image_token_id = image_token_id
+
+ self.initializer_range = initializer_range
+ self.use_cache = use_cache
+ self.center_crop = center_crop
+
+ super().__init__(**kwargs)
+
+
+class VLMPreTrainedModel(PreTrainedModel):
+ config_class = VLMConfig
+ base_model_prefix = "vlm"
+ supports_gradient_checkpointing = True
+ _no_split_modules = []
+ _skip_keys_device_placement = "past_key_values"
+
+ def _init_weights(self, module):
+ pass
+
+ def _initialize_weights(self, module):
+ pass
+
+
+class VLMForCausalLM(VLMPreTrainedModel):
+ def __init__(self, config: VLMConfig):
+ super().__init__(config)
+
+ self.config = config
+ self.text_config = AutoConfig.from_pretrained(config.text_decoder_name_or_path)
+ self.text_config.vocab_size += 3
+ self.text_decoder = AutoModelForCausalLM.from_config(self.text_config)
+
+ self.image_encoder = VisualEncoder(
+ self.config.image_encoder_hidden_size,
+ self.config.image_encoder_patch_size,
+ self.config.image_size,
+ self.config.image_encoder_num_layers,
+ self.config.image_encoder_num_heads,
+ self.config.image_encoder_embedding_dim,
+ self.config.image_encoder_pooling,
+ )
+
+ # replace models' layerscales because `transformers` automatically renames keys in state_dict
+ for i in range(len(self.image_encoder.blocks)):
+ self.image_encoder.blocks[i].ls1 = LayerScale(
+ self.image_encoder.blocks[i].ls1.dim,
+ )
+ self.image_encoder.blocks[i].ls2 = LayerScale(
+ self.image_encoder.blocks[i].ls2.dim,
+ )
+
+ self.image_pooler = ImageFeaturesPooler(
+ self.config.image_encoder_hidden_size,
+ self.text_config.hidden_size,
+ self.config.image_pooler_num_attn_heads,
+ self.config.image_pooler_intermediate_size,
+ self.config.image_pooler_num_latents,
+ self.config.initializer_range,
+ )
+
+ def get_input_embeddings(self):
+ return self.text_decoder.get_input_embeddings()
+
+ def set_input_embeddings(self, value):
+ self.text_decoder.set_input_embeddings(value)
+
+ def get_images_embeddings(self, images):
+ features = self.image_encoder.forward_features(images)
+ return self.image_pooler(features)
+
+ def gather_continuous_embeddings(
+ self,
+ input_ids: torch.Tensor,
+ word_embeddings: torch.Tensor,
+ image_embeddings: torch.Tensor,
+ ) -> torch.Tensor:
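+        # Assumes exactly one `image_token_id` placeholder per sample; splice the
+        # pooled image latents into the word embeddings at that position.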
+ start_indices = (input_ids == self.config.image_token_id).nonzero()[:, 1]
+ embeddings = []
+
+ for sample_idx, start_idx in enumerate(start_indices.tolist()):
+ embeddings.append(
+ torch.cat(
+ (
+ word_embeddings[sample_idx, :start_idx],
+ image_embeddings[sample_idx],
+ word_embeddings[sample_idx, start_idx + 1 :],
+ ),
+ dim=0,
+ ),
+ )
+
+ return torch.stack(embeddings, dim=0)
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ images: torch.Tensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[dict, Tuple, CausalLMOutputWithPast]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError(
+ "You cannot specify both input_ids and inputs_embeds at the same time",
+ )
+ elif input_ids is None and inputs_embeds is None:
+ raise ValueError("You have to specify either input_is or inputs_embeds")
+
+ if inputs_embeds is None and past_key_values is None:
+ inputs_embeds = self.get_input_embeddings()(input_ids)
+
+ if images is not None:
+ image_embeds = self.get_images_embeddings(images)
+ inputs_embeds = self.gather_continuous_embeddings(
+ input_ids,
+ inputs_embeds,
+ image_embeds,
+ )
+
+ if position_ids is None:
+ seq_length = inputs_embeds.shape[1] if inputs_embeds is not None else input_ids.shape[1]
+ past_key_values_length = 0
+
+ if past_key_values is not None:
+ past_key_values_length = past_key_values[0][0].shape[2]
+
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+ position_ids = torch.arange(
+ past_key_values_length,
+ seq_length + past_key_values_length,
+ dtype=torch.long,
+ device=device,
+ )
+ position_ids = position_ids.unsqueeze(0)
+
+ outputs = self.text_decoder(
+ inputs_embeds=inputs_embeds,
+ input_ids=input_ids if past_key_values is not None else None,
+ attention_mask=attention_mask,
+ labels=labels,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ use_cache=use_cache,
+ return_dict=return_dict,
+ )
+
+ return outputs
+
+ def prepare_inputs_for_generation(
+ self,
+ input_ids,
+ images=None,
+ past_key_values=None,
+ attention_mask=None,
+ inputs_embeds=None,
+ **kwargs,
+ ):
+ if past_key_values:
+ input_ids = input_ids[:, -1:]
+
+ position_ids = kwargs.get("position_ids", None)
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values:
+ position_ids = position_ids[:, -1].unsqueeze(-1)
+
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {"inputs_embeds": inputs_embeds}
+ else:
+ model_inputs = {"input_ids": input_ids}
+
+ if images is not None:
+ model_inputs["images"] = images
+
+ model_inputs.update(
+ {
+ "position_ids": position_ids,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "attention_mask": attention_mask,
+ "images": images if past_key_values is None else None,
+ },
+ )
+ return model_inputs
+
+ @classmethod
+ def from_config(cls, config, **kwargs):
+ return cls._from_config(config, **kwargs)
+
+
+class VLMProcessor(ProcessorMixin):
+ def __init__(self, config, **kwargs):
+ self.feature_extractor = None
+ self.config = config
+
+ if config.center_crop:
+ self.image_processor = Compose(
+ [
+ Resize(256, interpolation=InterpolationMode.BICUBIC),
+ CenterCrop(config.image_size),
+ convert_to_rgb,
+ ToTensor(),
+ Normalize(
+ mean=IMAGENET_MEAN,
+ std=IMAGENET_STD,
+ ),
+ ],
+ )
+ else:
+ self.image_processor = Compose(
+ [
+ RandomResizedCrop(
+ config.image_size,
+ scale=(0.8, 1),
+ interpolation=InterpolationMode.BICUBIC,
+ ),
+ convert_to_rgb,
+ ToTensor(),
+ Normalize(
+ mean=IMAGENET_MEAN,
+ std=IMAGENET_STD,
+ ),
+ ],
+ )
+
+ self.tokenizer = AutoTokenizer.from_pretrained(
+ config.tokenizer_name_or_path,
+ additional_special_tokens=["<|im_end|>"],
+ )
+ self.num_image_latents = config.image_pooler_num_latents
+
+ def __call__(self, texts=None, images=None, return_tensors="pt", **kwargs):
+ if texts is not None:
+ if isinstance(texts, str):
+ texts = [texts]
+
+ tokenized_texts = []
+ for text in texts:
+ messages = [
+ {"role": "system", "content": "You are a helpful assistant."},
+ {"role": "user", "content": f" {text}"},
+ ]
+ tokenized_prompt = self.tokenizer.apply_chat_template(
+ messages,
+ add_generation_prompt=True,
+ return_tensors=return_tensors,
+ )
+
+ tokenized_texts.append(tokenized_prompt)
+
+ max_len = max(len(t[0]) for t in tokenized_texts)
+ input_ids = torch.full(
+ (len(tokenized_texts), max_len),
+ fill_value=self.tokenizer.pad_token_id,
+ dtype=torch.int64,
+ )
+ attention_mask = torch.full(
+ (len(tokenized_texts), max_len),
+ fill_value=0,
+ dtype=torch.int64,
+ )
+
+ for i, tokens in enumerate(tokenized_texts):
+ input_ids[i, -len(tokens[0]) :] = tokens[0]
+ attention_mask[i, -len(tokens[0]) :] = 1
+
+ attention_mask = F.pad(
+ attention_mask,
+ pad=(0, self.num_image_latents - 1),
+ value=1,
+ )
+
+ encoding = BatchEncoding(
+ data={"input_ids": input_ids, "attention_mask": attention_mask},
+ )
+
+ if images is not None:
+ if isinstance(images, (list, tuple)):
+ image_features = torch.empty(
+ (len(images), 3, self.config.image_size, self.config.image_size),
+ dtype=torch.float32,
+ )
+
+ for i, image in enumerate(images):
+ image_features[i] = self.image_processor(image)
+ else:
+ image_features = self.image_processor(images).unsqueeze(0)
+
+ if texts is not None and images is not None:
+ encoding["images"] = image_features
+ return encoding
+
+ if texts is not None:
+ return encoding
+
+ return BatchEncoding(
+ data={
+ "images": image_features,
+ },
+ tensor_type=return_tensors,
+ )
+
+ def batch_decode(self, *args, **kwargs):
+ return self.tokenizer.batch_decode(*args, **kwargs)
+
+ def decode(self, *args, **kwargs):
+ return self.tokenizer.decode(*args, **kwargs)
+
+ @classmethod
+ def from_pretrained(
+ cls,
+ pretrained_model_name_or_path,
+ cache_dir=None,
+ force_download: bool = False,
+ local_files_only: bool = False,
+ token=None,
+ revision: str = "main",
+ **kwargs,
+ ):
+ config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
+ return cls(config)
+
+
+AutoConfig.register("vlm", VLMConfig)
+AutoModel.register(VLMConfig, VLMForCausalLM)
diff --git a/python/uform/torch_models.py b/python/uform/torch_encoders.py
similarity index 99%
rename from python/uform/torch_models.py
rename to python/uform/torch_encoders.py
index ab86622..4339765 100644
--- a/python/uform/torch_models.py
+++ b/python/uform/torch_encoders.py
@@ -353,7 +353,7 @@ def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor:
return embeddings
-class VLM(nn.Module):
+class TextVisualEncoder(nn.Module):
"""
Vision-Language Model for Multimodal embeddings.
"""
@@ -364,8 +364,9 @@ def __init__(self, config: Dict, tokenizer_path: PathLike):
"""
super().__init__()
- self._embedding_dim = config["text_encoder"]["embedding_dim"]
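+        # Drop tokenizer metadata that the TextEncoder constructor doesn't accept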
+ config["text_encoder"].pop("tokenizer_class", None)
+ self._embedding_dim = config["text_encoder"]["embedding_dim"]
self.text_encoder = TextEncoder(**config["text_encoder"])
self.image_encoder = VisualEncoder(**config["image_encoder"])
@@ -503,3 +504,6 @@ def embedding_dim(self) -> int:
def multimodal_embedding_dim(self) -> int:
"""Dimensionality of multimodal joint embedding."""
return self.text_encoder.dim
+
+
+VLM = TextVisualEncoder # legacy
diff --git a/python/uform/torch_preprocessor.py b/python/uform/torch_processors.py
similarity index 100%
rename from python/uform/torch_preprocessor.py
rename to python/uform/torch_processors.py
diff --git a/swift/EmbeddingsTests.swift b/swift/EmbeddingsTests.swift
index 5efb87f..889cdb6 100644
--- a/swift/EmbeddingsTests.swift
+++ b/swift/EmbeddingsTests.swift
@@ -27,7 +27,7 @@ final class TokenizerTests: XCTestCase {
let api = HubApi(hfToken: "xxx")
let textModel = try await TextEncoder(
- modelName: "unum-cloud/uform-vl2-english-small",
+ modelName: "unum-cloud/uform2-vl-english-small",
hubApi: api
)
@@ -78,11 +78,11 @@ final class TokenizerTests: XCTestCase {
// A better option is to fetch directly from HuggingFace, similar to how users would do that:
let api = HubApi(hfToken: "xxx")
let textModel = try await TextEncoder(
- modelName: "unum-cloud/uform-vl2-english-small",
+ modelName: "unum-cloud/uform2-vl-english-small",
hubApi: api
)
let imageModel = try await ImageEncoder(
- modelName: "unum-cloud/uform-vl2-english-small",
+ modelName: "unum-cloud/uform2-vl-english-small",
hubApi: api
)
diff --git a/swift/README.md b/swift/README.md
new file mode 100644
index 0000000..1eebf29
--- /dev/null
+++ b/swift/README.md
@@ -0,0 +1,44 @@
+# UForm for Swift
+
+UForm offers first-party support for Swift.
+To get started, create a package if you don't have one yet:
+
+```bash
+swift package init --type executable
+```
+
+Then add UForm to the dependencies of your `Package.swift` manifest:
+
+```swift
+dependencies: [
+    .package(url: "https://github.com/unum-cloud/uform.git", branch: "main")
+]
+```
+
+Then, import UForm in your Swift code:
+
+```swift
+import UForm
+```
+
+## Embeddings
+
+### Text Embeddings
+
+```swift
+let textModel = try await TextEncoder(modelName: "unum-cloud/uform2-vl-english-small")
+let text = "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie."
+let textEmbedding: Embedding = try textModel.forward(with: text)
+let textVector: [Float32] = textEmbedding.asFloats()
+```
+
+### Image Embeddings
+
+```swift
+let imageModel = try await ImageEncoder(modelName: "unum-cloud/uform2-vl-english-small")
+let imageURL = "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true"
+guard let url = URL(string: imageURL),
+    let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil),
+    let cgImage = CGImageSourceCreateImageAtIndex(imageSource, 0, nil)
+else {
+    fatalError("Could not load image from URL: \(imageURL)")
+}
+
+let imageEmbedding: Embedding = try imageModel.forward(with: cgImage)
+let imageVector: [Float32] = imageEmbedding.asFloats()
+```
+
+
+### Computing Distances
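+
+Once both embeddings are materialized as `[Float32]`, any vector similarity works.
+Here is a minimal cosine-similarity sketch, with no external dependencies assumed:
+
+```swift
+func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 {
+    precondition(a.count == b.count, "Embeddings must have the same dimensionality")
+    var dot: Float32 = 0, normA: Float32 = 0, normB: Float32 = 0
+    for i in 0..<a.count {
+        dot += a[i] * b[i]
+        normA += a[i] * a[i]
+        normB += b[i] * b[i]
+    }
+    return dot / (normA.squareRoot() * normB.squareRoot())
+}
+
+let similarity = cosineSimilarity(textVector, imageVector)
+```
+
+On Apple platforms, the same dot products can also be computed with `vDSP` from the Accelerate framework.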