diff --git a/.gitignore b/.gitignore index af7d4af..4db8e17 100755 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,13 @@ test build/ package-lock.json *.egg-info -*.onnx __pycache__ .build -.swiftpm \ No newline at end of file +.swiftpm +.hf_token +node_modules + +# Tensors & ML Models +*.onnx +*.pt +*.safetensors diff --git a/README.md b/README.md index 031c484..32957e7 100755 --- a/README.md +++ b/README.md @@ -20,9 +20,11 @@ For Content Understanding and Generation

Multimodal Embeddings from 64 to 768 Dimensions • 1B Parameter Chat
-Short Texts • Images • 🔜 Video Clips +Short Texts • Images • 🔜 Video Clips • 🔜 Long Documents
-PyTorch • ONNX +ONNX • CoreML • PyTorch +
+Python • JavaScript • Swift

--- @@ -279,7 +281,7 @@ The generative model can be used to caption images, summarize their content, or The exact behavior is controlled by prompts. ```python -from uform.gen_model import VLMForCausalLM, VLMProcessor +from uform.torch_decoders import VLMForCausalLM, VLMProcessor model = VLMForCausalLM.from_pretrained('unum-cloud/uform-gen') processor = VLMProcessor.from_pretrained('unum-cloud/uform-gen') diff --git a/javascript/README.md b/javascript/README.md new file mode 100644 index 0000000..5626d39 --- /dev/null +++ b/javascript/README.md @@ -0,0 +1,10 @@ +# UForm for JavaScript + + + +```bash +pnpm add uform +npm add uform +yarn add uform +``` + diff --git a/package.json b/package.json new file mode 100644 index 0000000..7331231 --- /dev/null +++ b/package.json @@ -0,0 +1,11 @@ +{ + "name": "uform", + "private": true, + "version": "2.0.2", + "description": "Pocket-Sized Multimodal AI for Content Understanding and Generation", + "dependencies": { + "@huggingface/hub": "^0.14.8", + "@xenova/transformers": "^2.17.0", + "onnxruntime-web": "^1.17.3" + } +} diff --git a/python/scripts/bench.py b/python/scripts/bench.py index 49c7004..8bcaf37 100644 --- a/python/scripts/bench.py +++ b/python/scripts/bench.py @@ -13,7 +13,7 @@ ) from uform import get_model -from uform.gen_model import VLMForCausalLM, VLMProcessor +from uform.torch_decoders import VLMForCausalLM, VLMProcessor dtype = torch.bfloat16 low_cpu_mem_usage = False diff --git a/python/scripts/export.ipynb b/python/scripts/export_encoders.ipynb similarity index 56% rename from python/scripts/export.ipynb rename to python/scripts/export_encoders.ipynb index 7afa4cc..df57858 100644 --- a/python/scripts/export.ipynb +++ b/python/scripts/export_encoders.ipynb @@ -36,7 +36,7 @@ "import uform\n", "from PIL import Image\n", "\n", - "model, processor = uform.get_model('unum-cloud/' + model_name)\n", + "model, processor = uform.get_model('unum-cloud/uform-vl-english-small')\n", "text = 'a small red panda in a zoo'\n", "image = Image.open('../../assets/unum.png')\n", "\n", @@ -83,66 +83,6 @@ " break # We break after the first layer" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PyTorch\n", - "\n", - "Let's ensure:\n", - "\n", - "- the `model.text_encoder` inputs are called `input_ids` and `attention_mask`, and outputs are `embeddings` and `features`.\n", - "- the `model.image_encoder` input is called `input`, and outputs are `embeddings` and `features`.\n", - "- the model itself works fine in `f16` half-precision, so that the model is lighter and easier to download." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "list(name for name, _ in model.text_encoder.named_parameters())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Verify input and output names for text_encoder\n", - "text_encoder_input_names = [name for name, _ in model.text_encoder.named_parameters()]\n", - "assert 'input_ids' in text_encoder_input_names, \"input_ids not found in text_encoder inputs\"\n", - "assert 'attention_mask' in text_encoder_input_names, \"attention_mask not found in text_encoder inputs\"\n", - "\n", - "text_encoder_output_names = [name for name, _ in model.text_encoder.named_modules()]\n", - "assert 'embeddings' in text_encoder_output_names, \"embeddings not found in text_encoder outputs\"\n", - "assert 'features' in text_encoder_output_names, \"features not found in text_encoder outputs\"\n", - "\n", - "# Verify input and output names for image_encoder\n", - "image_encoder_input_names = [name for name, _ in model.image_encoder.named_parameters()]\n", - "assert 'input' in image_encoder_input_names, \"input not found in image_encoder inputs\"\n", - "\n", - "image_encoder_output_names = [name for name, _ in model.image_encoder.named_modules()]\n", - "assert 'embeddings' in image_encoder_output_names, \"embeddings not found in image_encoder outputs\"\n", - "assert 'features' in image_encoder_output_names, \"features not found in image_encoder outputs\"\n", - "\n", - "# Ensure the model can be converted to f16 half-precision\n", - "try:\n", - " model.half() # Convert to half precision\n", - " print(\"Model successfully converted to half precision (f16).\")\n", - "except Exception as e:\n", - " print(f\"An error occurred while converting the model to half precision: {e}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ONNX" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -241,12 +181,12 @@ "coreml_model = ct.convert(\n", " traced_script_module, source=\"pytorch\",\n", " inputs=[image_input], outputs=[image_features, image_embeddings],\n", - " convert_to='mlprogram', compute_precision=precision)\n", + " convert_to='mlprogram', compute_precision=ct.precision)\n", "\n", "coreml_model.author = 'Unum Cloud'\n", "coreml_model.license = 'Apache 2.0'\n", "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", - "coreml_model.save(os.path.join(output_directory, model_name + \"-image.mlpackage\"))" + "coreml_model.save(\"../uform-vl-english-small-image.mlpackage\")" ] }, { @@ -277,7 +217,256 @@ "coreml_model.author = 'Unum Cloud'\n", "coreml_model.license = 'Apache 2.0'\n", "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", - "coreml_model.save(os.path.join(output_directory, model_name + \"-text.mlpackage\"))" + "coreml_model.save(\"../uform-vl-english-small-text.mlpackage\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PyTorch\n", + "\n", + "Let's ensure:\n", + "\n", + "- the `model.text_encoder` inputs are called `input_ids` and `attention_mask`, and outputs are `embeddings` and `features`.\n", + "- the `model.image_encoder` input is called `input`, and outputs are `embeddings` and `features`.\n", + "- the model itself works fine in `f16` half-precision, so that the model is lighter and easier to download." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from safetensors import safe_open\n", + "from safetensors.torch import save_file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.image_encoder.eval()\n", + "model.image_encoder.to(dtype=torch.bfloat16)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "torch.save(model.image_encoder.state_dict(), 'image.pt')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "save_file(model.image_encoder.state_dict(), \"image.safetensors\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.text_encoder.eval()\n", + "model.text_encoder.to(dtype=torch.bfloat16)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "torch.save(model.text_encoder.state_dict(), 'text.pt')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "save_file(model.text_encoder.state_dict(), \"text.safetensors\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "image_features, image_embedding = model.encode_image(image_data.to(dtype=torch.bfloat16), return_features=True)\n", + "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n", + "\n", + "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.safetensors image.safetensors\n", + "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.safetensors text.safetensors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.pt image.pt\n", + "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.pt text.pt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ONNX" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install onnx onnxconverter-common" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.onnx import export as onnx_export" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can't immediately export to `bfloat16` as it's not supported by ONNX, but we also can't export to `float16`, as the forward pass (that will be traced) is gonna fail. So let's export to `float32` ONNX file first." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module = model.text_encoder\n", + "module.eval()\n", + "module.return_features = True\n", + "module.to(dtype=torch.float32)\n", + "\n", + "onnx_export(\n", + " module,\n", + " (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n", + " \"text.onnx\", \n", + " export_params=True,\n", + " opset_version=15,\n", + " do_constant_folding=True,\n", + " input_names = ['input_ids', 'attention_mask'], \n", + " output_names = ['features', 'embeddings'],\n", + " dynamic_axes={\n", + " 'input_ids' : {0 : 'batch_size'}, \n", + " 'attention_mask' : {0 : 'batch_size'}, \n", + " 'features' : {0 : 'batch_size'}, \n", + " 'embeddings' : {0 : 'batch_size'}})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnx\n", + "from onnxconverter_common import float16\n", + "\n", + "module = onnx.load(\"text.onnx\")\n", + "module_fp16 = float16.convert_float_to_float16(module)\n", + "onnx.save(module_fp16, \"text.onnx\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now repeat the same for images." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module = model.image_encoder\n", + "module.eval()\n", + "module.return_features = True\n", + "module.to(dtype=torch.float32)\n", + "\n", + "torch.onnx.export(\n", + " module,\n", + " image_data, \n", + " \"image.onnx\", \n", + " export_params=True,\n", + " opset_version=15,\n", + " do_constant_folding=True,\n", + " input_names = ['input'], \n", + " output_names = ['features', 'embeddings'],\n", + " dynamic_axes={\n", + " 'input' : {0 : 'batch_size'},\n", + " 'features' : {0 : 'batch_size'},\n", + " 'embeddings' : {0 : 'batch_size'}})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnx\n", + "from onnxconverter_common import float16\n", + "\n", + "module = onnx.load(\"image.onnx\")\n", + "module_fp16 = float16.convert_float_to_float16(module)\n", + "onnx.save(module_fp16, \"image.onnx\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.onnx image.onnx\n", + "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.onnx text.onnx" ] } ], @@ -297,7 +486,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.11.5" } }, "nbformat": 4, diff --git a/python/scripts/test_generative.py b/python/scripts/test_decoders.py similarity index 100% rename from python/scripts/test_generative.py rename to python/scripts/test_decoders.py diff --git a/python/scripts/test_embeddings.py b/python/scripts/test_encoders.py similarity index 87% rename from python/scripts/test_embeddings.py rename to python/scripts/test_encoders.py index d71bf0b..e7541c1 100644 --- a/python/scripts/test_embeddings.py +++ b/python/scripts/test_encoders.py @@ -1,4 +1,5 @@ from typing import Tuple +import os import pytest from PIL import Image @@ -21,6 +22,7 @@ onnx_available = False torch_models = [ + 
"unum-cloud/uform2-vl-english-small", "unum-cloud/uform-vl-english", "unum-cloud/uform-vl-multilingual-v2", ] @@ -34,11 +36,20 @@ ("unum-cloud/uform-vl-english-large", "gpu", "fp16"), ] +# Let's check if the HuggingFace Hub API token is set in the environment variable. +# If it's not there, check if the `.hf_token` file is present in the current working directory. +token = os.getenv("HUGGINGFACE_HUB_TOKEN", None) +if token is None: + token_path = "./.hf_token" + if os.path.exists(token_path): + with open(token_path, "r") as file: + token = file.read().strip() + @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") @pytest.mark.parametrize("model_name", torch_models) def test_torch_one_embedding(model_name: str): - model, processor = uform.get_model(model_name) + model, processor = uform.get_model(model_name, token=token) text = "a small red panda in a zoo" image_path = "assets/unum.png" @@ -67,7 +78,7 @@ def test_torch_one_embedding(model_name: str): @pytest.mark.parametrize("model_name", torch_models) @pytest.mark.parametrize("batch_size", [1, 2]) def test_torch_many_embeddings(model_name: str, batch_size: int): - model, processor = uform.get_model(model_name) + model, processor = uform.get_model(model_name, token=token) texts = ["a small red panda in a zoo"] * batch_size image_paths = ["assets/unum.png"] * batch_size @@ -90,7 +101,7 @@ def test_onnx_one_embedding(model_specs: Tuple[str, str, str]): try: - model, processor = uform.get_model_onnx(*model_specs) + model, processor = uform.get_model_onnx(*model_specs, token=token) text = "a small red panda in a zoo" image_path = "assets/unum.png" @@ -126,7 +137,7 @@ def test_onnx_many_embeddings(model_specs: Tuple[str, str, str], batch_size: int try: - model, processor = uform.get_model_onnx(*model_specs) + model, processor = uform.get_model_onnx(*model_specs, token=token) texts = ["a small red panda in a zoo"] * batch_size image_paths = ["assets/unum.png"] * batch_size diff --git a/python/uform/__init__.py b/python/uform/__init__.py index 1ecb242..f5a15c2 100755 --- a/python/uform/__init__.py +++ b/python/uform/__init__.py @@ -1,40 +1,90 @@ from json import load -from os.path import join +from os.path import join, exists from typing import Mapping, Optional, Tuple +from enum import Enum from huggingface_hub import snapshot_download -def get_checkpoint(model_name: str, token: str) -> Tuple[str, Mapping, str]: - import torch - - model_path = snapshot_download(repo_id=model_name, token=token) - config_path = join(model_path, "torch_config.json") +class Modality(Enum): + TEXT_ENCODER = "text_encoder" + IMAGE_ENCODER = "image_encoder" - state = torch.load(join(model_path, "torch_weight.pt")) - return config_path, state, join(model_path, "tokenizer.json") +def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str, Modality]) -> Tuple[str, Mapping, str]: + import torch -def get_model(model_name: str, token: Optional[str] = None): - from uform.torch_models import VLM - from uform.torch_preprocessor import TorchProcessor - - config_path, state, tokenizer_path = get_checkpoint(model_name, token) + # It is not recommended to use `.pth` extension when checkpointing models + # because it collides with Python path (`.pth`) configuration files. 
+ merged_model_names = ["torch_weight.pt", "weight.pt", "model.pt"] + separate_modality_names = [(x.value if isinstance(x, Modality) else x) + ".pt" for x in modalities] + config_names = ["torch_config.json", "config.json"] + tokenizer_names = ["tokenizer.json"] + + # The download stats depend on the number of times the `config.json` is pulled + # https://huggingface.co/docs/hub/models-download-stats + model_path = snapshot_download( + repo_id=model_name, + token=token, + allow_patterns=merged_model_names + separate_modality_names + config_names + tokenizer_names, + ) + + # Find the first name in `config_names` that is present + config_path = None + for config_name in config_names: + if exists(join(model_path, config_name)): + config_path = join(model_path, config_name) + break + + # Same for the tokenizer + tokenizer_path = None + for tokenizer_name in tokenizer_names: + if exists(join(model_path, tokenizer_name)): + tokenizer_path = join(model_path, tokenizer_name) + break + + # Ideally, we want to separately fetch all the models. + # If those aren't available, aggregate separate modalities and merge them. + state = None + for file_name in merged_model_names: + if exists(join(model_path, file_name)): + state = torch.load(join(model_path, file_name)) + break + + if state is None: + state = {} + for file_name in separate_modality_names: + if exists(join(model_path, file_name)): + modality_name, _, _ = file_name.partition(".") + property_name = modality_name + state[property_name] = torch.load(join(model_path, file_name)) + + return config_path, state, tokenizer_path + + +def get_model(model_name: str, token: Optional[str] = None, modalities: Optional[Tuple[str]] = None): + from uform.torch_encoders import TextVisualEncoder + from uform.torch_processors import TorchProcessor + + if modalities is None: + modalities = (Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER) + + config_path, state, tokenizer_path = get_checkpoint(model_name, token, modalities) with open(config_path) as f: config = load(f) - model = VLM(config, tokenizer_path) - model.image_encoder.load_state_dict(state["image_encoder"]) - model.text_encoder.load_state_dict(state["text_encoder"]) + model = TextVisualEncoder(config, tokenizer_path) + model.image_encoder.load_state_dict(state.get("image_encoder", None)) + model.text_encoder.load_state_dict(state.get("text_encoder", None)) processor = TorchProcessor(config, tokenizer_path) return model.eval(), processor def get_model_onnx(model_name: str, device: str, dtype: str, token: Optional[str] = None): - from uform.onnx_models import VLM_ONNX - from uform.numpy_preprocessor import NumPyProcessor + from uform.onnx_encoders import TextVisualEncoder + from uform.numpy_processors import NumPyProcessor assert device in ( "cpu", @@ -53,7 +103,7 @@ def get_model_onnx(model_name: str, device: str, dtype: str, token: Optional[str with open(join(model_path, "config.json")) as f: config = load(f) - model = VLM_ONNX(model_path, config, device, dtype) + model = TextVisualEncoder(model_path, config, device, dtype) processor = NumPyProcessor(config, join(model_path, "tokenizer.json")) return model, processor diff --git a/python/uform/chat.py b/python/uform/chat.py index 5ef44b7..c9f8dc3 100644 --- a/python/uform/chat.py +++ b/python/uform/chat.py @@ -5,7 +5,7 @@ from PIL import Image from transformers import TextStreamer -from uform.gen_model import VLMForCausalLM, VLMProcessor +from uform.torch_decoders import VLMForCausalLM, VLMProcessor EOS_TOKEN = 32001 diff --git
a/python/uform/gen_model.py b/python/uform/gen_model.py index c03b6eb..6792120 100644 --- a/python/uform/gen_model.py +++ b/python/uform/gen_model.py @@ -1,464 +1 @@ -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -from torch import nn -from torchvision.transforms import (CenterCrop, Compose, InterpolationMode, - Normalize, RandomResizedCrop, Resize, - ToTensor) -from transformers import AutoConfig, AutoTokenizer -from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_outputs import CausalLMOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.models.auto.modeling_auto import (AutoModel, - AutoModelForCausalLM) -from transformers.processing_utils import ProcessorMixin -from transformers.tokenization_utils_base import BatchEncoding - -from uform.torch_models import VisualEncoder - -IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073) -IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711) - - -def convert_to_rgb(image): - return image.convert("RGB") - - -class LayerScale(nn.Module): - def __init__(self, dim, init_values: float = 1e-5, inplace: bool = False): - super().__init__() - self.weight = nn.Parameter(init_values * torch.ones(dim)) - self.inplace = inplace - - def forward(self, x): - return x.mul_(self.weight) if self.inplace else x * self.weight - - -class ImageFeaturesPooler(nn.Module): - def __init__( - self, - input_size, - hidden_size, - num_attn_heads, - intermediate_size, - num_latents, - initializer_range, - ): - super().__init__() - self.projection = nn.Linear(input_size, hidden_size) - - self.pooler = nn.TransformerDecoderLayer( - hidden_size, - num_attn_heads, - intermediate_size, - activation=nn.functional.silu, - batch_first=True, - norm_first=True, - ) - self.image_latents = nn.Parameter( - torch.randn(1, num_latents, hidden_size) * initializer_range**0.5, - ) - - def forward(self, features): - features = self.projection(features) - return self.pooler( - self.image_latents.expand(features.shape[0], -1, -1), - features, - ) - - -class VLMConfig(PretrainedConfig): - model_type = "vlm" - - def __init__( - self, - text_decoder_name_or_path: str = "", - tokenizer_name_or_path: str = "", - image_size: int = 224, - image_encoder_hidden_size: int = 768, - image_encoder_patch_size: int = 16, - image_encoder_num_layers: int = 12, - image_encoder_num_heads: int = 12, - image_encoder_embedding_dim: int = 256, - image_encoder_pooling: str = "cls", - image_pooler_num_attn_heads: int = 16, - image_pooler_intermediate_size: int = 5504, - image_pooler_num_latents: int = 196, - image_token_id: int = 32002, - initializer_range: float = 0.02, - use_cache: bool = True, - center_crop: bool = True, - **kwargs, - ): - self.text_decoder_name_or_path = text_decoder_name_or_path - self.tokenizer_name_or_path = tokenizer_name_or_path - - self.image_size = image_size - self.image_encoder_hidden_size = image_encoder_hidden_size - self.image_encoder_patch_size = image_encoder_patch_size - self.image_encoder_num_layers = image_encoder_num_layers - self.image_encoder_num_heads = image_encoder_num_heads - self.image_encoder_embedding_dim = image_encoder_embedding_dim - self.image_encoder_pooling = image_encoder_pooling - - self.image_pooler_num_attn_heads = image_pooler_num_attn_heads - self.image_pooler_intermediate_size = image_pooler_intermediate_size - self.image_pooler_num_latents = image_pooler_num_latents - - self.image_token_id = image_token_id - - self.initializer_range = 
initializer_range - self.use_cache = use_cache - self.center_crop = center_crop - - super().__init__(**kwargs) - - -class VLMPreTrainedModel(PreTrainedModel): - config_class = VLMConfig - base_model_prefix = "vlm" - supports_gradient_checkpointing = True - _no_split_modules = [] - _skip_keys_device_placement = "past_key_values" - - def _init_weights(self, module): - pass - - def _initialize_weights(self, module): - pass - - -class VLMForCausalLM(VLMPreTrainedModel): - def __init__(self, config: VLMConfig): - super().__init__(config) - - self.config = config - self.text_config = AutoConfig.from_pretrained(config.text_decoder_name_or_path) - self.text_config.vocab_size += 3 - self.text_decoder = AutoModelForCausalLM.from_config(self.text_config) - - self.image_encoder = VisualEncoder( - self.config.image_encoder_hidden_size, - self.config.image_encoder_patch_size, - self.config.image_size, - self.config.image_encoder_num_layers, - self.config.image_encoder_num_heads, - self.config.image_encoder_embedding_dim, - self.config.image_encoder_pooling, - ) - - # replace models' layerscales because `transformers` automatically renames keys in state_dict - for i in range(len(self.image_encoder.blocks)): - self.image_encoder.blocks[i].ls1 = LayerScale( - self.image_encoder.blocks[i].ls1.dim, - ) - self.image_encoder.blocks[i].ls2 = LayerScale( - self.image_encoder.blocks[i].ls2.dim, - ) - - self.image_pooler = ImageFeaturesPooler( - self.config.image_encoder_hidden_size, - self.text_config.hidden_size, - self.config.image_pooler_num_attn_heads, - self.config.image_pooler_intermediate_size, - self.config.image_pooler_num_latents, - self.config.initializer_range, - ) - - def get_input_embeddings(self): - return self.text_decoder.get_input_embeddings() - - def set_input_embeddings(self, value): - self.text_decoder.set_input_embeddings(value) - - def get_images_embeddings(self, images): - features = self.image_encoder.forward_features(images) - return self.image_pooler(features) - - def gather_continuous_embeddings( - self, - input_ids: torch.Tensor, - word_embeddings: torch.Tensor, - image_embeddings: torch.Tensor, - ) -> torch.Tensor: - start_indices = (input_ids == self.config.image_token_id).nonzero()[:, 1] - embeddings = [] - - for sample_idx, start_idx in enumerate(start_indices.tolist()): - embeddings.append( - torch.cat( - ( - word_embeddings[sample_idx, :start_idx], - image_embeddings[sample_idx], - word_embeddings[sample_idx, start_idx + 1 :], - ), - dim=0, - ), - ) - - return torch.stack(embeddings, dim=0) - - def forward( - self, - input_ids: torch.LongTensor = None, - images: torch.Tensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[dict, Tuple, CausalLMOutputWithPast]: - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - if input_ids 
is not None and inputs_embeds is not None: - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time", - ) - elif input_ids is None and inputs_embeds is None: - raise ValueError("You have to specify either input_is or inputs_embeds") - - if inputs_embeds is None and past_key_values is None: - inputs_embeds = self.get_input_embeddings()(input_ids) - - if images is not None: - image_embeds = self.get_images_embeddings(images) - inputs_embeds = self.gather_continuous_embeddings( - input_ids, - inputs_embeds, - image_embeds, - ) - - if position_ids is None: - seq_length = ( - inputs_embeds.shape[1] - if inputs_embeds is not None - else input_ids.shape[1] - ) - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, - seq_length + past_key_values_length, - dtype=torch.long, - device=device, - ) - position_ids = position_ids.unsqueeze(0) - - outputs = self.text_decoder( - inputs_embeds=inputs_embeds, - input_ids=input_ids if past_key_values is not None else None, - attention_mask=attention_mask, - labels=labels, - position_ids=position_ids, - past_key_values=past_key_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - use_cache=use_cache, - return_dict=return_dict, - ) - - return outputs - - def prepare_inputs_for_generation( - self, - input_ids, - images=None, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - **kwargs, - ): - if past_key_values: - input_ids = input_ids[:, -1:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - if images is not None: - model_inputs["images"] = images - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - "images": images if past_key_values is None else None, - }, - ) - return model_inputs - - @classmethod - def from_config(cls, config, **kwargs): - return cls._from_config(config, **kwargs) - - -class VLMProcessor(ProcessorMixin): - def __init__(self, config, **kwargs): - self.feature_extractor = None - self.config = config - - if config.center_crop: - self.image_processor = Compose( - [ - Resize(256, interpolation=InterpolationMode.BICUBIC), - CenterCrop(config.image_size), - convert_to_rgb, - ToTensor(), - Normalize( - mean=IMAGENET_MEAN, - std=IMAGENET_STD, - ), - ], - ) - else: - self.image_processor = Compose( - [ - RandomResizedCrop( - config.image_size, - scale=(0.8, 1), - interpolation=InterpolationMode.BICUBIC, - ), - convert_to_rgb, - ToTensor(), - Normalize( - mean=IMAGENET_MEAN, - std=IMAGENET_STD, - ), - ], - ) - - self.tokenizer = AutoTokenizer.from_pretrained( - config.tokenizer_name_or_path, - additional_special_tokens=["<|im_end|>"], - ) - self.num_image_latents = config.image_pooler_num_latents 
- - def __call__(self, texts=None, images=None, return_tensors="pt", **kwargs): - if texts is not None: - if isinstance(texts, str): - texts = [texts] - - tokenized_texts = [] - for text in texts: - messages = [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": f" {text}"}, - ] - tokenized_prompt = self.tokenizer.apply_chat_template( - messages, - add_generation_prompt=True, - return_tensors=return_tensors, - ) - - tokenized_texts.append(tokenized_prompt) - - max_len = max(len(t[0]) for t in tokenized_texts) - input_ids = torch.full( - (len(tokenized_texts), max_len), - fill_value=self.tokenizer.pad_token_id, - dtype=torch.int64, - ) - attention_mask = torch.full( - (len(tokenized_texts), max_len), - fill_value=0, - dtype=torch.int64, - ) - - for i, tokens in enumerate(tokenized_texts): - input_ids[i, -len(tokens[0]) :] = tokens[0] - attention_mask[i, -len(tokens[0]) :] = 1 - - attention_mask = F.pad( - attention_mask, - pad=(0, self.num_image_latents - 1), - value=1, - ) - - encoding = BatchEncoding( - data={"input_ids": input_ids, "attention_mask": attention_mask}, - ) - - if images is not None: - if isinstance(images, (list, tuple)): - image_features = torch.empty( - (len(images), 3, self.config.image_size, self.config.image_size), - dtype=torch.float32, - ) - - for i, image in enumerate(images): - image_features[i] = self.image_processor(image) - else: - image_features = self.image_processor(images).unsqueeze(0) - - if texts is not None and images is not None: - encoding["images"] = image_features - return encoding - - if texts is not None: - return encoding - - return BatchEncoding( - data={ - "images": image_features, - }, - tensor_type=return_tensors, - ) - - def batch_decode(self, *args, **kwargs): - return self.tokenizer.batch_decode(*args, **kwargs) - - def decode(self, *args, **kwargs): - return self.tokenizer.decode(*args, **kwargs) - - @classmethod - def from_pretrained( - cls, - pretrained_model_name_or_path, - cache_dir=None, - force_download: bool = False, - local_files_only: bool = False, - token=None, - revision: str = "main", - **kwargs, - ): - config = AutoConfig.from_pretrained(pretrained_model_name_or_path) - return cls(config) - - -AutoConfig.register("vlm", VLMConfig) -AutoModel.register(VLMConfig, VLMForCausalLM) +from uform.torch_decoders import VLMForCausalLM, VLMProcessor # legacy path diff --git a/python/uform/numpy_preprocessor.py b/python/uform/numpy_processors.py similarity index 100% rename from python/uform/numpy_preprocessor.py rename to python/uform/numpy_processors.py diff --git a/python/uform/onnx_models.py b/python/uform/onnx_encoders.py similarity index 97% rename from python/uform/onnx_models.py rename to python/uform/onnx_encoders.py index 8e2a87a..68255de 100644 --- a/python/uform/onnx_models.py +++ b/python/uform/onnx_encoders.py @@ -23,7 +23,7 @@ def available_providers(device: str) -> Tuple[str, ...]: return cpu_providers -class VisualEncoderONNX: +class VisualEncoder: def __init__(self, model_path: str, device: str): """ :param model_path: Path to onnx model @@ -43,7 +43,7 @@ def __call__(self, images: ndarray) -> Tuple[ndarray, ndarray]: return self.session.run(None, {"images": images}) -class TextEncoderONNX: +class TextEncoder: def __init__(self, text_encoder_path: str, reranker_path: str, device: str): """ :param text_encoder_path: Path to onnx of text encoder @@ -82,7 +82,7 @@ def forward_multimodal( ) -class VLM_ONNX: +class TextVisualEncoder: def __init__(self, checkpoint_path: str, config: 
Dict, device: str, dtype: str): assert device in ( "cpu", @@ -103,13 +103,13 @@ def __init__(self, checkpoint_path: str, config: Dict, device: str, dtype: str): self._text_encoder_dim = config["text_encoder"]["dim"] self._image_encoder_dim = config["image_encoder"]["dim"] - self.text_encoder = TextEncoderONNX( + self.text_encoder = TextEncoder( join(checkpoint_path, f"text_encoder.onnx"), join(checkpoint_path, f"reranker.onnx"), device, ) - self.image_encoder = VisualEncoderONNX(join(checkpoint_path, f"image_encoder.onnx"), device) + self.image_encoder = VisualEncoder(join(checkpoint_path, f"image_encoder.onnx"), device) def encode_image( self, @@ -229,3 +229,6 @@ def embedding_dim(self) -> int: def multimodal_embedding_dim(self) -> int: """Dimensionality of multimodal joint embedding.""" return self._text_encoder_dim + + +VLM_ONNX = TextVisualEncoder # legacy diff --git a/python/uform/preprocessing.py b/python/uform/preprocessing.py deleted file mode 100644 index d3d833e..0000000 --- a/python/uform/preprocessing.py +++ /dev/null @@ -1,105 +0,0 @@ -from os import PathLike -from typing import Dict, List, Union - -import torch -from PIL import Image -from tokenizers import Tokenizer -from torch import Tensor -from torchvision.transforms import (CenterCrop, Compose, InterpolationMode, - Normalize, Resize, ToTensor) - - -# lambda is not pickable -def convert_to_rgb(image): - return image.convert("RGB") - - -class Processor: - def __init__(self, config: Dict, tokenizer_path: PathLike, tensor_type: str = "pt"): - """ - :param config: model config - :param tokenizer_path: path to tokenizer file - :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy) - """ - - assert tensor_type in ("pt", "np"), "`tensor_type` must be either `pt` or `np`" - - self._image_size = config["image_encoder"]["image_size"] - self._max_seq_len = config["text_encoder"]["max_position_embeddings"] - self._tokenizer = Tokenizer.from_file(tokenizer_path) - self._tokenizer.no_padding() - self._pad_token_idx = config["text_encoder"]["padding_idx"] - - self.tensor_type = tensor_type - - self._image_transform = Compose( - [ - Resize(self._image_size, interpolation=InterpolationMode.BICUBIC), - convert_to_rgb, - CenterCrop(self._image_size), - ToTensor(), - Normalize( - mean=(0.48145466, 0.4578275, 0.40821073), - std=(0.26862954, 0.26130258, 0.27577711), - ), - ], - ) - - def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]: - """Transforms one or more strings into dictionary with tokenized strings and attention masks. - - :param texts: text of list of texts to tokenizer - """ - if isinstance(texts, str): - texts = [texts] - - input_ids = torch.full( - (len(texts), self._max_seq_len), - fill_value=self._pad_token_idx, - dtype=torch.int64, - ) - - attention_mask = torch.zeros( - len(texts), - self._max_seq_len, - dtype=torch.int32, - ) - encoded = self._tokenizer.encode_batch(texts) - - for i, seq in enumerate(encoded): - seq_len = min(len(seq), self._max_seq_len) - input_ids[i, :seq_len] = torch.LongTensor( - seq.ids[: self._max_seq_len], - ) - attention_mask[i, :seq_len] = 1 - - if self.tensor_type == "np": - return { - "input_ids": input_ids.numpy(), - "attention_mask": attention_mask.numpy(), - } - - return {"input_ids": input_ids, "attention_mask": attention_mask} - - def preprocess_image(self, images: Union[Image, List[Image]]) -> Tensor: - """Transforms one or more Pillow images into Torch Tensors. 
- - :param images: image or list of images to preprocess - """ - - if isinstance(images, list): - batch_images = torch.empty( - (len(images), 3, self._image_size, self._image_size), - dtype=torch.float32, - ) - - for i, image in enumerate(images): - batch_images[i] = self._image_transform(image) - - else: - batch_images = self._image_transform(images).unsqueeze(0) - - if self.tensor_type == "np": - return batch_images.numpy() - - return batch_images diff --git a/python/uform/torch_decoders.py b/python/uform/torch_decoders.py new file mode 100644 index 0000000..79b058d --- /dev/null +++ b/python/uform/torch_decoders.py @@ -0,0 +1,457 @@ +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from torch import nn +from torchvision.transforms import ( + CenterCrop, + Compose, + InterpolationMode, + Normalize, + RandomResizedCrop, + Resize, + ToTensor, +) +from transformers import AutoConfig, AutoTokenizer +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_outputs import CausalLMOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.models.auto.modeling_auto import AutoModel, AutoModelForCausalLM +from transformers.processing_utils import ProcessorMixin +from transformers.tokenization_utils_base import BatchEncoding + +from uform.torch_encoders import VisualEncoder + +IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073) +IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711) + + +def convert_to_rgb(image): + return image.convert("RGB") + + +class LayerScale(nn.Module): + def __init__(self, dim, init_values: float = 1e-5, inplace: bool = False): + super().__init__() + self.weight = nn.Parameter(init_values * torch.ones(dim)) + self.inplace = inplace + + def forward(self, x): + return x.mul_(self.weight) if self.inplace else x * self.weight + + +class ImageFeaturesPooler(nn.Module): + def __init__( + self, + input_size, + hidden_size, + num_attn_heads, + intermediate_size, + num_latents, + initializer_range, + ): + super().__init__() + self.projection = nn.Linear(input_size, hidden_size) + + self.pooler = nn.TransformerDecoderLayer( + hidden_size, + num_attn_heads, + intermediate_size, + activation=nn.functional.silu, + batch_first=True, + norm_first=True, + ) + self.image_latents = nn.Parameter( + torch.randn(1, num_latents, hidden_size) * initializer_range**0.5, + ) + + def forward(self, features): + features = self.projection(features) + return self.pooler( + self.image_latents.expand(features.shape[0], -1, -1), + features, + ) + + +class VLMConfig(PretrainedConfig): + model_type = "vlm" + + def __init__( + self, + text_decoder_name_or_path: str = "", + tokenizer_name_or_path: str = "", + image_size: int = 224, + image_encoder_hidden_size: int = 768, + image_encoder_patch_size: int = 16, + image_encoder_num_layers: int = 12, + image_encoder_num_heads: int = 12, + image_encoder_embedding_dim: int = 256, + image_encoder_pooling: str = "cls", + image_pooler_num_attn_heads: int = 16, + image_pooler_intermediate_size: int = 5504, + image_pooler_num_latents: int = 196, + image_token_id: int = 32002, + initializer_range: float = 0.02, + use_cache: bool = True, + center_crop: bool = True, + **kwargs, + ): + self.text_decoder_name_or_path = text_decoder_name_or_path + self.tokenizer_name_or_path = tokenizer_name_or_path + + self.image_size = image_size + self.image_encoder_hidden_size = image_encoder_hidden_size + self.image_encoder_patch_size = image_encoder_patch_size + 
self.image_encoder_num_layers = image_encoder_num_layers + self.image_encoder_num_heads = image_encoder_num_heads + self.image_encoder_embedding_dim = image_encoder_embedding_dim + self.image_encoder_pooling = image_encoder_pooling + + self.image_pooler_num_attn_heads = image_pooler_num_attn_heads + self.image_pooler_intermediate_size = image_pooler_intermediate_size + self.image_pooler_num_latents = image_pooler_num_latents + + self.image_token_id = image_token_id + + self.initializer_range = initializer_range + self.use_cache = use_cache + self.center_crop = center_crop + + super().__init__(**kwargs) + + +class VLMPreTrainedModel(PreTrainedModel): + config_class = VLMConfig + base_model_prefix = "vlm" + supports_gradient_checkpointing = True + _no_split_modules = [] + _skip_keys_device_placement = "past_key_values" + + def _init_weights(self, module): + pass + + def _initialize_weights(self, module): + pass + + +class VLMForCausalLM(VLMPreTrainedModel): + def __init__(self, config: VLMConfig): + super().__init__(config) + + self.config = config + self.text_config = AutoConfig.from_pretrained(config.text_decoder_name_or_path) + self.text_config.vocab_size += 3 + self.text_decoder = AutoModelForCausalLM.from_config(self.text_config) + + self.image_encoder = VisualEncoder( + self.config.image_encoder_hidden_size, + self.config.image_encoder_patch_size, + self.config.image_size, + self.config.image_encoder_num_layers, + self.config.image_encoder_num_heads, + self.config.image_encoder_embedding_dim, + self.config.image_encoder_pooling, + ) + + # replace models' layerscales because `transformers` automatically renames keys in state_dict + for i in range(len(self.image_encoder.blocks)): + self.image_encoder.blocks[i].ls1 = LayerScale( + self.image_encoder.blocks[i].ls1.dim, + ) + self.image_encoder.blocks[i].ls2 = LayerScale( + self.image_encoder.blocks[i].ls2.dim, + ) + + self.image_pooler = ImageFeaturesPooler( + self.config.image_encoder_hidden_size, + self.text_config.hidden_size, + self.config.image_pooler_num_attn_heads, + self.config.image_pooler_intermediate_size, + self.config.image_pooler_num_latents, + self.config.initializer_range, + ) + + def get_input_embeddings(self): + return self.text_decoder.get_input_embeddings() + + def set_input_embeddings(self, value): + self.text_decoder.set_input_embeddings(value) + + def get_images_embeddings(self, images): + features = self.image_encoder.forward_features(images) + return self.image_pooler(features) + + def gather_continuous_embeddings( + self, + input_ids: torch.Tensor, + word_embeddings: torch.Tensor, + image_embeddings: torch.Tensor, + ) -> torch.Tensor: + start_indices = (input_ids == self.config.image_token_id).nonzero()[:, 1] + embeddings = [] + + for sample_idx, start_idx in enumerate(start_indices.tolist()): + embeddings.append( + torch.cat( + ( + word_embeddings[sample_idx, :start_idx], + image_embeddings[sample_idx], + word_embeddings[sample_idx, start_idx + 1 :], + ), + dim=0, + ), + ) + + return torch.stack(embeddings, dim=0) + + def forward( + self, + input_ids: torch.LongTensor = None, + images: torch.Tensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = 
None, + ) -> Union[dict, Tuple, CausalLMOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time", + ) + elif input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_is or inputs_embeds") + + if inputs_embeds is None and past_key_values is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if images is not None: + image_embeds = self.get_images_embeddings(images) + inputs_embeds = self.gather_continuous_embeddings( + input_ids, + inputs_embeds, + image_embeds, + ) + + if position_ids is None: + seq_length = inputs_embeds.shape[1] if inputs_embeds is not None else input_ids.shape[1] + past_key_values_length = 0 + + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, + seq_length + past_key_values_length, + dtype=torch.long, + device=device, + ) + position_ids = position_ids.unsqueeze(0) + + outputs = self.text_decoder( + inputs_embeds=inputs_embeds, + input_ids=input_ids if past_key_values is not None else None, + attention_mask=attention_mask, + labels=labels, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + use_cache=use_cache, + return_dict=return_dict, + ) + + return outputs + + def prepare_inputs_for_generation( + self, + input_ids, + images=None, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + **kwargs, + ): + if past_key_values: + input_ids = input_ids[:, -1:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + if images is not None: + model_inputs["images"] = images + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + "images": images if past_key_values is None else None, + }, + ) + return model_inputs + + @classmethod + def from_config(cls, config, **kwargs): + return cls._from_config(config, **kwargs) + + +class VLMProcessor(ProcessorMixin): + def __init__(self, config, **kwargs): + self.feature_extractor = None + self.config = config + + if config.center_crop: + self.image_processor = Compose( + [ + Resize(256, interpolation=InterpolationMode.BICUBIC), + CenterCrop(config.image_size), + convert_to_rgb, + ToTensor(), + Normalize( + mean=IMAGENET_MEAN, + 
std=IMAGENET_STD, + ), + ], + ) + else: + self.image_processor = Compose( + [ + RandomResizedCrop( + config.image_size, + scale=(0.8, 1), + interpolation=InterpolationMode.BICUBIC, + ), + convert_to_rgb, + ToTensor(), + Normalize( + mean=IMAGENET_MEAN, + std=IMAGENET_STD, + ), + ], + ) + + self.tokenizer = AutoTokenizer.from_pretrained( + config.tokenizer_name_or_path, + additional_special_tokens=["<|im_end|>"], + ) + self.num_image_latents = config.image_pooler_num_latents + + def __call__(self, texts=None, images=None, return_tensors="pt", **kwargs): + if texts is not None: + if isinstance(texts, str): + texts = [texts] + + tokenized_texts = [] + for text in texts: + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": f" {text}"}, + ] + tokenized_prompt = self.tokenizer.apply_chat_template( + messages, + add_generation_prompt=True, + return_tensors=return_tensors, + ) + + tokenized_texts.append(tokenized_prompt) + + max_len = max(len(t[0]) for t in tokenized_texts) + input_ids = torch.full( + (len(tokenized_texts), max_len), + fill_value=self.tokenizer.pad_token_id, + dtype=torch.int64, + ) + attention_mask = torch.full( + (len(tokenized_texts), max_len), + fill_value=0, + dtype=torch.int64, + ) + + for i, tokens in enumerate(tokenized_texts): + input_ids[i, -len(tokens[0]) :] = tokens[0] + attention_mask[i, -len(tokens[0]) :] = 1 + + attention_mask = F.pad( + attention_mask, + pad=(0, self.num_image_latents - 1), + value=1, + ) + + encoding = BatchEncoding( + data={"input_ids": input_ids, "attention_mask": attention_mask}, + ) + + if images is not None: + if isinstance(images, (list, tuple)): + image_features = torch.empty( + (len(images), 3, self.config.image_size, self.config.image_size), + dtype=torch.float32, + ) + + for i, image in enumerate(images): + image_features[i] = self.image_processor(image) + else: + image_features = self.image_processor(images).unsqueeze(0) + + if texts is not None and images is not None: + encoding["images"] = image_features + return encoding + + if texts is not None: + return encoding + + return BatchEncoding( + data={ + "images": image_features, + }, + tensor_type=return_tensors, + ) + + def batch_decode(self, *args, **kwargs): + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + return self.tokenizer.decode(*args, **kwargs) + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path, + cache_dir=None, + force_download: bool = False, + local_files_only: bool = False, + token=None, + revision: str = "main", + **kwargs, + ): + config = AutoConfig.from_pretrained(pretrained_model_name_or_path) + return cls(config) + + +AutoConfig.register("vlm", VLMConfig) +AutoModel.register(VLMConfig, VLMForCausalLM) diff --git a/python/uform/torch_models.py b/python/uform/torch_encoders.py similarity index 99% rename from python/uform/torch_models.py rename to python/uform/torch_encoders.py index ab86622..4339765 100644 --- a/python/uform/torch_models.py +++ b/python/uform/torch_encoders.py @@ -353,7 +353,7 @@ def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor: return embeddings -class VLM(nn.Module): +class TextVisualEncoder(nn.Module): """ Vision-Language Model for Multimodal embeddings. 
""" @@ -364,8 +364,9 @@ def __init__(self, config: Dict, tokenizer_path: PathLike): """ super().__init__() - self._embedding_dim = config["text_encoder"]["embedding_dim"] + config["text_encoder"].pop("tokenizer_class", None) + self._embedding_dim = config["text_encoder"]["embedding_dim"] self.text_encoder = TextEncoder(**config["text_encoder"]) self.image_encoder = VisualEncoder(**config["image_encoder"]) @@ -503,3 +504,6 @@ def embedding_dim(self) -> int: def multimodal_embedding_dim(self) -> int: """Dimensionality of multimodal joint embedding.""" return self.text_encoder.dim + + +VLM = TextVisualEncoder # legacy diff --git a/python/uform/torch_preprocessor.py b/python/uform/torch_processors.py similarity index 100% rename from python/uform/torch_preprocessor.py rename to python/uform/torch_processors.py diff --git a/swift/EmbeddingsTests.swift b/swift/EmbeddingsTests.swift index 5efb87f..889cdb6 100644 --- a/swift/EmbeddingsTests.swift +++ b/swift/EmbeddingsTests.swift @@ -27,7 +27,7 @@ final class TokenizerTests: XCTestCase { let api = HubApi(hfToken: "xxx") let textModel = try await TextEncoder( - modelName: "unum-cloud/uform-vl2-english-small", + modelName: "unum-cloud/uform2-vl-english-small", hubApi: api ) @@ -78,11 +78,11 @@ final class TokenizerTests: XCTestCase { // A better option is to fetch directly from HuggingFace, similar to how users would do that: let api = HubApi(hfToken: "xxx") let textModel = try await TextEncoder( - modelName: "unum-cloud/uform-vl2-english-small", + modelName: "unum-cloud/uform2-vl-english-small", hubApi: api ) let imageModel = try await ImageEncoder( - modelName: "unum-cloud/uform-vl2-english-small", + modelName: "unum-cloud/uform2-vl-english-small", hubApi: api ) diff --git a/swift/README.md b/swift/README.md new file mode 100644 index 0000000..1eebf29 --- /dev/null +++ b/swift/README.md @@ -0,0 +1,44 @@ +# UForm for Swift + +UForm offers first-party support for Swift. +To get started, add UForm to your project using Swift Package Manager. + +```bash +swift package init --type executable +swift package add uform +``` + +Then, import UForm in your Swift code: + +```swift +import UForm +``` + +## Embeddings + +### Text Embeddings + +```swift +let textModel = try await TextEncoder(modelName: "unum-cloud/uform2-vl-english-small") +let text = "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie." +let textEmbedding: Embedding = try textModel.forward(with: text) +let textVector: [Float32] = textEmbedding.asFloats() +``` + +### Image Embeddings + +```swift +let imageModel = try await ImageEncoder(modelName: "unum-cloud/uform2-vl-english-small") +let imageURL = "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true" +guard let url = URL(string: imageURL), + let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil), + let cgImage = CGImageSourceCreateImageAtIndex(imageSource, 0, nil) { + throw Exception("Could not load image from URL: \(imageURL)") +} + +var imageEmbedding: Embedding = try imageModel.forward(with: cgImage) +var imageVector: [Float32] = embedding.asFloats() +``` + + +### Computing Distances \ No newline at end of file