diff --git a/.gitignore b/.gitignore index 4db8e17..f4fa33b 100755 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ node_modules *.onnx *.pt *.safetensors +*.mlpackage diff --git a/.vscode/settings.json b/.vscode/settings.json index a6cceb8..5052dea 100755 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,8 +1,10 @@ { "cSpell.words": [ "arange", + "astype", "CFURL", "coreml", + "crossattn", "cumsum", "dtype", "embs", @@ -25,12 +27,17 @@ "pretrained", "probs", "pypi", + "pytest", + "randn", "rerank", "reranker", "reranking", + "sandbeach", "sess", "SIMD", "softmax", + "Tensorrt", + "torchvision", "transfromers", "uform", "unimodal", diff --git a/Package.swift b/Package.swift index 6ac8372..b3b9ffd 100644 --- a/Package.swift +++ b/Package.swift @@ -29,13 +29,13 @@ let package = Package( .product(name: "Transformers", package: "swift-transformers") ], path: "swift", - exclude: ["EmbeddingsTests.swift"] + exclude: ["EncodersTests.swift"] ), .testTarget( name: "UFormTests", dependencies: ["UForm"], path: "swift", - sources: ["EmbeddingsTests.swift"] + sources: ["EncodersTests.swift"] ), ] ) diff --git a/pyproject.toml b/pyproject.toml index 10f7a9b..1a84808 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,8 @@ classifiers = [ dependencies = [ "huggingface_hub>=0.16.4", "tokenizers>=0.13.3", - "pillow" + "pillow", + "simsimd", ] description = "Pocket-Sized Multimodal AI for Content Understanding and Generation" maintainers = [ diff --git a/python/scripts/export_encoders.ipynb b/python/scripts/export_encoders.ipynb index df57858..0ec6e8a 100644 --- a/python/scripts/export_encoders.ipynb +++ b/python/scripts/export_encoders.ipynb @@ -4,7 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Scripts for Exporting PyTorch Models to ONNX and CoreML" + "# Scripts for Exporting PyTorch Models to ONNX and CoreML\n", + "\n", + "Depending on the backend, we prefer different quantization schemes.\n", + "\n", + "- For ONNX we use `int8` quantization.\n", + "- For PyTorch we use `bfloat16` quantization.\n", + "- For CoreML we use `float32` representation."
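As a rough sketch of those three per-backend precision choices (not the notebook's actual export code — the toy `torch.nn.Linear` model and the `model.onnx` / `model_int8.onnx` / `model.pt` file names are placeholders), the handling might look like this:

```python
import torch
import coremltools as ct
from onnxruntime.quantization import QuantType, quantize_dynamic

# Toy stand-in for an exported encoder; the real notebook exports the UForm text/image encoders.
model = torch.nn.Linear(4, 4).eval()
dummy = torch.randn(1, 4)

# ONNX backend: export in float32, then apply post-training dynamic int8 quantization.
torch.onnx.export(model, dummy, "model.onnx")
quantize_dynamic("model.onnx", "model_int8.onnx", weight_type=QuantType.QInt8)

# CoreML backend: convert a traced graph, keeping the full float32 representation.
traced = torch.jit.trace(model, dummy)
coreml_model = ct.convert(
    traced,
    inputs=[ct.TensorType(shape=dummy.shape)],
    compute_precision=ct.precision.FLOAT32,
    convert_to="mlprogram",
)

# PyTorch backend: store the checkpoint itself in bfloat16 (done last, since .to() casts in place).
torch.save(model.to(dtype=torch.bfloat16).state_dict(), "model.pt")
```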
] }, { @@ -18,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -29,9 +35,47 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: dlopen(/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so, 0x0006): Symbol not found: __ZN3c106detail19maybe_wrap_dim_slowExxb\n", + " Referenced from: <0B637046-A38B-3A5C-80C6-E847C27DCCD5> /Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so\n", + " Expected in: <3AE92490-D363-3FD7-8532-CB6F5F795BC8> /Users/av/miniconda3/lib/python3.10/site-packages/torch/lib/libc10.dylib\n", + " warn(f\"Failed to load image Python extension: {e}\")\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d4bf831f84cb4a88bc5fe4aa4487b241", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fetching 3 files: 0%| | 0/3 [00:00 float: + if not isinstance(x, np.ndarray): + x = x.detach().numpy() + if not isinstance(y, np.ndarray): + y = y.detach().numpy() + + # Unlike NumPy, SimSIMD can properly deal with integer types + x = x.astype(np.float32).flatten() + y = y.astype(np.float32).flatten() + return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)) + + +def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embedding): + """Test if the embeddings of text and image are semantically similar + using a small set of example text-image pairs.""" + + texts = [ + "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.", + "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.", + "A young girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.", + "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.", + "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.", + ] + + image_urls = [ + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true", + ] + + text_embeddings = [] + image_embeddings = [] + + for text, image_url in zip(texts, image_urls): + # Download and open the image + response = requests.get(image_url) + image = Image.open(BytesIO(response.content)) + + # Get 
embeddings + text_embedding = text_to_embedding(text) + image_embedding = image_to_embedding(image) + + text_embeddings.append(text_embedding) + image_embeddings.append(image_embedding) + + # Evaluate cosine similarity + for i in range(len(texts)): + pair_similarity = cosine_similarity(text_embeddings[i], image_embeddings[i]) + other_text_similarities = [ + cosine_similarity(text_embeddings[j], image_embeddings[i]) for j in range(len(texts)) if j != i + ] + other_image_similarities = [ + cosine_similarity(text_embeddings[i], image_embeddings[j]) for j in range(len(texts)) if j != i + ] + + assert pair_similarity > max( + other_text_similarities + ), "Text should be more similar to its corresponding image than to other images." + assert pair_similarity > max( + other_image_similarities + ), "Image should be more similar to its corresponding text than to other texts." + + @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") @pytest.mark.parametrize("model_name", torch_models) def test_torch_one_embedding(model_name: str): @@ -73,6 +137,12 @@ def test_torch_one_embedding(model_name: str): assert score.shape[0] == 1, "Matching score batch size is not 1" assert joint_embedding.shape[0] == 1, "Joint embedding batch size is not 1" + # Test if the model outputs actually make sense + cross_references_image_and_text_embeddings( + lambda text: model.encode_text(processor.preprocess_text(text)), + lambda image: model.encode_image(processor.preprocess_image(image)), + ) + @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") @pytest.mark.parametrize("model_name", torch_models) @@ -94,14 +164,15 @@ def test_torch_many_embeddings(model_name: str, batch_size: int): @pytest.mark.skipif(not onnx_available, reason="ONNX is not installed") -@pytest.mark.parametrize("model_specs", onnx_models_and_providers) -def test_onnx_one_embedding(model_specs: Tuple[str, str, str]): +@pytest.mark.parametrize("model_name", onnx_models) +@pytest.mark.parametrize("device", ["CPUExecutionProvider"]) +def test_onnx_one_embedding(model_name: str, device: str): - from uform.onnx_models import ExecutionProviderError + from uform.onnx_encoders import ExecutionProviderError try: - model, processor = uform.get_model_onnx(*model_specs, token=token) + model, processor = uform.get_model_onnx(model_name, token=token, device=device) text = "a small red panda in a zoo" image_path = "assets/unum.png" @@ -115,29 +186,27 @@ def test_onnx_one_embedding(model_specs: Tuple[str, str, str]): assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1" assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1" - score, joint_embedding = model.encode_multimodal( - image_features=image_features, - text_features=text_features, - attention_mask=text_data["attention_mask"], - return_scores=True, + # Test if the model outputs actually make sense + cross_references_image_and_text_embeddings( + lambda text: model.encode_text(processor.preprocess_text(text)), + lambda image: model.encode_image(processor.preprocess_image(image)), ) - assert score.shape[0] == 1, "Matching score batch size is not 1" - assert joint_embedding.shape[0] == 1, "Joint embedding batch size is not 1" except ExecutionProviderError as e: pytest.skip(f"Execution provider error: {e}") @pytest.mark.skipif(not onnx_available, reason="ONNX is not installed") -@pytest.mark.parametrize("model_specs", onnx_models_and_providers) +@pytest.mark.parametrize("model_name", onnx_models) @pytest.mark.parametrize("batch_size", 
[1, 2]) -def test_onnx_many_embeddings(model_specs: Tuple[str, str, str], batch_size: int): +@pytest.mark.parametrize("device", ["CPUExecutionProvider"]) +def test_onnx_many_embeddings(model_name: str, batch_size: int, device: str): - from uform.onnx_models import ExecutionProviderError + from uform.onnx_encoders import ExecutionProviderError try: - model, processor = uform.get_model_onnx(*model_specs, token=token) + model, processor = uform.get_model_onnx(model_name, token=token, device=device) texts = ["a small red panda in a zoo"] * batch_size image_paths = ["assets/unum.png"] * batch_size diff --git a/python/uform/__init__.py b/python/uform/__init__.py index f5a15c2..44fce13 100755 --- a/python/uform/__init__.py +++ b/python/uform/__init__.py @@ -1,6 +1,6 @@ from json import load from os.path import join, exists -from typing import Mapping, Optional, Tuple +from typing import Dict, Optional, Tuple, Literal from enum import Enum from huggingface_hub import snapshot_download @@ -9,15 +9,38 @@ class Modality(Enum): TEXT_ENCODER = "text_encoder" IMAGE_ENCODER = "image_encoder" + VIDEO_ENCODER = "video_encoder" + TEXT_DECODER = "text_decoder" -def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str, Modality]) -> Tuple[str, Mapping, str]: - import torch +def normalize_modalities(modalities: Tuple[str, Modality]) -> Tuple[Modality]: + if modalities is None: + return (Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER, Modality.TEXT_DECODER, Modality.VIDEO_ENCODER) + + return tuple(x if isinstance(x, Modality) else Modality(x) for x in modalities) + + +def get_checkpoint( + model_name: str, + modalities: Tuple[str, Modality], + token: Optional[str] = None, + format: Literal[".pt", ".onnx"] = ".pt", +) -> Tuple[str, Dict[Modality, str], Optional[str]]: + """Downloads a model checkpoint from the Hugging Face Hub. + + :param model_name: The name of the model to download, like `unum-cloud/uform3-image-text-english-small` + :param token: The Hugging Face API token, if required + :param modalities: The modalities to download, like `("text_encoder", "image_encoder")` + :param format: The format of the model checkpoint, either `.pt` or `.onnx` + :return: A tuple of the config path, dictionary of paths to different modalities, and tokenizer path + """ + + modalities = normalize_modalities(modalities) # It is not recommended to use `.pth` extension when checkpointing models # because it collides with Python path (`.pth`) configuration files. - merged_model_names = ["torch_weight.pt", "weight.pt", "model.pt"] - separate_modality_names = [(x.value if isinstance(x, Modality) else x) + ".pt" for x in modalities] + merged_model_names = [x + format for x in ["torch_weight", "weight", "model"]] + separate_modality_names = [(x.value if isinstance(x, Modality) else x) + format for x in modalities] config_names = ["torch_config.json", "config.json"] tokenizer_names = ["tokenizer.json"] @@ -45,65 +68,58 @@ def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str, # Ideally, we want to separately fetch all the models. # If those aren't available, aggregate separate modalities and merge them. 
- state = None + modality_paths = None for file_name in merged_model_names: if exists(join(model_path, file_name)): - state = torch.load(join(model_path, file_name)) + modality_paths = join(model_path, file_name) break - if state is None: - state = {} - for file_name in separate_modality_names: - if exists(join(model_path, file_name)): - modality_name, _, _ = file_name.partition(".") - property_name = modality_name + "_encoder" - state[property_name] = torch.load(join(model_path, file_name)) + if modality_paths is None: + modality_paths = {} + for separate_modality_name in separate_modality_names: + if exists(join(model_path, separate_modality_name)): + modality_name, _, _ = separate_modality_name.partition(".") + modality_paths[Modality(modality_name)] = join(model_path, separate_modality_name) - return config_path, state, tokenizer_path + return config_path, modality_paths, tokenizer_path -def get_model(model_name: str, token: Optional[str] = None, modalities: Optional[Tuple[str]] = None): - from python.uform.torch_encoders import TextVisualEncoder - from python.uform.torch_processors import TorchProcessor +def get_model( + model_name: str, + *, + token: Optional[str] = None, + modalities: Optional[Tuple[str]] = None, +): + from uform.torch_encoders import TextVisualEncoder + from uform.torch_processors import TorchProcessor - if modalities is None: - modalities = (Modality.TEXT, Modality.IMAGE) - - config_path, state, tokenizer_path = get_checkpoint(model_name, token, modalities) - - with open(config_path) as f: - config = load(f) + config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".pt") + modality_paths = ( + {k.value: v for k, v in modality_paths.items()} if isinstance(modality_paths, dict) else modality_paths + ) - model = TextVisualEncoder(config, tokenizer_path) - model.image_encoder.load_state_dict(state.get("image_encoder", None)) - model.text_encoder.load_state_dict(state.get("text_encoder", None)) - processor = TorchProcessor(config, tokenizer_path) + model = TextVisualEncoder(config_path, modality_paths) + processor = TorchProcessor(config_path, tokenizer_path) return model.eval(), processor -def get_model_onnx(model_name: str, device: str, dtype: str, token: Optional[str] = None): - from python.uform.onnx_encoders import TextVisualEncoder - from python.uform.numpy_processors import NumPyProcessor +def get_model_onnx( + model_name: str, + *, + device: Literal["cpu", "cuda"] = "cpu", + token: Optional[str] = None, + modalities: Optional[Tuple[str]] = None, +): + from uform.onnx_encoders import TextVisualEncoder + from uform.numpy_processors import NumPyProcessor - assert device in ( - "cpu", - "gpu", - ), f"Invalid `device`: {device}. Must be either `cpu` or `gpu`" - assert dtype in ( - "fp32", - "fp16", - ), f"Invalid `dtype`: {dtype}. 
Must be either `fp32` or `fp16` (only for gpu)" - assert ( - device == "cpu" and dtype == "fp32" - ) or device == "gpu", "Combination `device`=`cpu` & `dtype=fp16` is not supported" - - model_path = snapshot_download(repo_id=f"{model_name}-{device}-{dtype}", token=token) - - with open(join(model_path, "config.json")) as f: - config = load(f) + config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".onnx") + modality_paths = ( + {k.value: v for k, v in modality_paths.items()} if isinstance(modality_paths, dict) else modality_paths + ) - model = TextVisualEncoder(model_path, config, device, dtype) - processor = NumPyProcessor(config, join(model_path, "tokenizer.json")) + model = TextVisualEncoder(config_path, modality_paths, device=device) + processor = NumPyProcessor(config_path, tokenizer_path) return model, processor diff --git a/python/uform/numpy_processors.py b/python/uform/numpy_processors.py index a556db4..d300504 100644 --- a/python/uform/numpy_processors.py +++ b/python/uform/numpy_processors.py @@ -1,5 +1,6 @@ from os import PathLike from typing import Dict, List, Union +import json from PIL.Image import Image, BICUBIC from tokenizers import Tokenizer @@ -7,13 +8,14 @@ class NumPyProcessor: - def __init__(self, config: Dict, tokenizer_path: PathLike): + def __init__(self, config_path: PathLike, tokenizer_path: PathLike): """ :param config: model config :param tokenizer_path: path to tokenizer file :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy) """ + config = json.load(open(config_path, "r")) self._image_size = config["image_encoder"]["image_size"] self._max_seq_len = config["text_encoder"]["max_position_embeddings"] self._tokenizer = Tokenizer.from_file(tokenizer_path) diff --git a/python/uform/onnx_encoders.py b/python/uform/onnx_encoders.py index 68255de..8201693 100644 --- a/python/uform/onnx_encoders.py +++ b/python/uform/onnx_encoders.py @@ -1,5 +1,6 @@ -from os.path import join -from typing import Dict, Optional, Tuple, Union +from os import PathLike +from typing import Dict, Optional, Tuple, Union, Literal +import json import onnxruntime as ort from numpy import ndarray @@ -9,18 +10,52 @@ class ExecutionProviderError(Exception): """Exception raised when a requested execution provider is not available.""" -def available_providers(device: str) -> Tuple[str, ...]: +def available_providers(device: Optional[str]) -> Tuple[str, ...]: + """Returns a tuple of available execution providers based on the requested device. + https://onnxruntime.ai/docs/execution-providers/ + + :param device: Device name, either `cpu` or `gpu`, or a specific execution provider name. + :return: Tuple of available execution providers. + :raises ExecutionProviderError: If the requested device is not available. 
+ """ + gpu_providers = ("CUDAExecutionProvider", "TensorrtExecutionProvider") cpu_providers = ("OpenVINOExecutionProvider", "CoreMLExecutionProvider", "CPUExecutionProvider") available = ort.get_available_providers() - if device == "gpu": + + # If no target device is specified, let's sort all the available ones with respect to our preference + if device is None: + preferences = gpu_providers + cpu_providers + filtered_preferences = tuple(provider for provider in preferences if provider in available) + if len(filtered_preferences): + return filtered_preferences + if len(available): + return available + raise ExecutionProviderError("No execution providers are available") + + # If a GPU is requested, but no GPU providers are available, raise an error + if device == "gpu" or device == "cuda": if all(provider not in available for provider in gpu_providers): raise ExecutionProviderError( f"GPU providers are not available, consider installing `onnxruntime-gpu` and make sure the CUDA is available on your system. Currently installed: {available}" ) return gpu_providers - return cpu_providers + # If a CPU is requested, but no CPU providers are available, raise an error + if device == "cpu": + if all(provider not in available for provider in cpu_providers): + raise ExecutionProviderError( + f"CPU providers are not available, consider installing `onnxruntime` and make sure the OpenVINO and CoreML are available on your system. Currently installed: {available}" + ) + return cpu_providers + + if device not in available: + available_providers = ", ".join(available) + raise ExecutionProviderError( + f"Execution provider {device} is not available. Currently installed: {available_providers}" + ) + + return (device,) class VisualEncoder: @@ -40,11 +75,11 @@ def __init__(self, model_path: str, device: str): ) def __call__(self, images: ndarray) -> Tuple[ndarray, ndarray]: - return self.session.run(None, {"images": images}) + return self.session.run(None, {"input": images}) class TextEncoder: - def __init__(self, text_encoder_path: str, reranker_path: str, device: str): + def __init__(self, text_encoder_path: str, device: str): """ :param text_encoder_path: Path to onnx of text encoder :param reranker_path: Path to onnx of reranker @@ -60,56 +95,35 @@ def __init__(self, text_encoder_path: str, reranker_path: str, device: str): providers=available_providers(device), ) - self.reranker_session = ort.InferenceSession( - reranker_path, - sess_options=session_options, - providers=available_providers(device), - ) - def __call__(self, input_ids: ndarray, attention_mask: ndarray) -> Tuple[ndarray, ndarray]: return self.text_encoder_session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask}) - def forward_multimodal( - self, text_features: ndarray, attention_mask: ndarray, image_features: ndarray - ) -> Tuple[ndarray, ndarray]: - return self.reranker_session.run( - None, - { - "text_features": text_features, - "attention_mask": attention_mask, - "image_features": image_features, - }, - ) - class TextVisualEncoder: - def __init__(self, checkpoint_path: str, config: Dict, device: str, dtype: str): - assert device in ( - "cpu", - "gpu", - ), f"Invalid `device`: {device}. Must be either `cpu` or `gpu`" - assert dtype in ( - "fp32", - "fp16", - ), f"Invalid `dtype`: {dtype}. 
Must be either `fp32` or `fp16` (only for gpu)" - assert ( - device == "cpu" and dtype == "fp32" - ) or device == "gpu", "Combination `device`=`cpu` & `dtype=fp16` is not supported" - + def __init__( + self, + config_path: PathLike, + modality_paths: Union[Dict[str, PathLike], PathLike] = None, + *, + device: Literal["cpu", "cuda"] = "cpu", + ): + """Initializes the model with the configuration and pre-trained weights. + + :param config_path: Path to the JSON model configuration file + :param modality_paths: Dictionary with paths to different modalities, + or a single path to the model checkpoint + """ self.device = device - self.dtype = dtype + config = json.load(open(config_path, "r")) self._embedding_dim = config["text_encoder"]["embedding_dim"] self._text_encoder_dim = config["text_encoder"]["dim"] self._image_encoder_dim = config["image_encoder"]["dim"] - self.text_encoder = TextEncoder( - join(checkpoint_path, f"text_encoder.onnx"), - join(checkpoint_path, f"reranker.onnx"), - device, - ) - - self.image_encoder = VisualEncoder(join(checkpoint_path, f"image_encoder.onnx"), device) + text_encoder_path = modality_paths.get("text_encoder", None) + image_encoder_path = modality_paths.get("image_encoder", None) + self.text_encoder = TextEncoder(text_encoder_path, device) if text_encoder_path else None + self.image_encoder = VisualEncoder(image_encoder_path, device) if image_encoder_path else None def encode_image( self, @@ -147,51 +161,6 @@ def encode_text( return embeddings - def encode_multimodal( - self, - image: Optional[ndarray] = None, - text: Dict[str, ndarray] = None, - image_features: Optional[ndarray] = None, - text_features: Optional[ndarray] = None, - attention_mask: Optional[ndarray] = None, - return_scores: bool = False, - ) -> Union[ndarray, Tuple[ndarray, ndarray]]: - """Passes preprocessed texts (or precomputed texts features) and - preprocessed images (or precomputed images features) through multimodal encoded to produce matching scores and optionally multimodal joint embeddings. 
- - :param image: Preprocessed images - :param text: Preprocessed texts - :param image_features: Precomputed images features - :param text_features: Precomputed text features - :param attention_mask: Attention masks, not required if pass `text` instead of text_features - """ - - assert image is not None or image_features is not None, "Either `image` or `image_features` should be non None" - assert text is not None or text_features is not None, "Either `text_data` or `text_features` should be non None" - - if text_features is not None: - assert attention_mask is not None, "if `text_features` is not None, then you should pass `attention_mask`" - - if image_features is None: - image_features = self.image_encoder(image) - - if text_features is None: - text_features = self.text_encoder( - text["input_ids"], - text["attention_mask"], - ) - - matching_scores, embeddings = self.text_encoder.forward_multimodal( - text_features, - attention_mask if attention_mask is not None else text["attention_mask"], - image_features, - ) - - if return_scores: - return matching_scores, embeddings - - return embeddings - def forward( self, images: ndarray, diff --git a/python/uform/torch_encoders.py b/python/uform/torch_encoders.py index 4339765..2a0a0c9 100644 --- a/python/uform/torch_encoders.py +++ b/python/uform/torch_encoders.py @@ -1,6 +1,7 @@ from dataclasses import dataclass from os import PathLike from typing import Dict, Optional, Tuple, Union +import json import torch import torch.nn as nn @@ -358,17 +359,45 @@ class TextVisualEncoder(nn.Module): Vision-Language Model for Multimodal embeddings. """ - def __init__(self, config: Dict, tokenizer_path: PathLike): - """ - :param config: Model config + def __init__( + self, + config_path: PathLike, + modality_paths: Union[Dict[str, PathLike], PathLike] = None, + ): + """Initializes the model with the configuration and pre-trained weights. + + :param config_path: Path to the JSON model configuration file + :param modality_paths: Dictionary with paths to different modalities, + or a single path to the model checkpoint """ super().__init__() - config["text_encoder"].pop("tokenizer_class", None) + config = json.load(open(config_path, "r")) self._embedding_dim = config["text_encoder"]["embedding_dim"] - self.text_encoder = TextEncoder(**config["text_encoder"]) - self.image_encoder = VisualEncoder(**config["image_encoder"]) + + # Both `text_encoder` and `image_encoder` are data-classes, so we must strip + # all the non-member attributes before initializing the classes. 
+ text_fields = TextEncoder.__dataclass_fields__ + image_fields = VisualEncoder.__dataclass_fields__ + text_encoder_attrs = {k: v for k, v in config["text_encoder"].items() if k in text_fields} + image_encoder_attrs = {k: v for k, v in config["image_encoder"].items() if k in image_fields} + self.text_encoder = TextEncoder(**text_encoder_attrs) + self.image_encoder = VisualEncoder(**image_encoder_attrs) + + # Load pre-trained weights + if modality_paths is not None: + if isinstance(modality_paths, Union[PathLike, str]): + state = torch.load(modality_paths) + self.text_encoder.load_state_dict(state["text_encoder"]) + self.image_encoder.load_state_dict(state["image_encoder"]) + else: + text_encoder_path = modality_paths.get("text_encoder", None) + image_encoder_path = modality_paths.get("image_encoder", None) + if text_encoder_path: + self.text_encoder.load_state_dict(torch.load(text_encoder_path)) + if image_encoder_path: + self.image_encoder.load_state_dict(torch.load(image_encoder_path)) def encode_image( self, diff --git a/python/uform/torch_processors.py b/python/uform/torch_processors.py index 8bdc70b..b435efb 100644 --- a/python/uform/torch_processors.py +++ b/python/uform/torch_processors.py @@ -1,5 +1,6 @@ from os import PathLike from typing import Dict, List, Union +import json import torch from PIL.Image import Image @@ -15,19 +16,20 @@ ) -# lambda is not pickable +# lambda is not pickle-able def convert_to_rgb(image): return image.convert("RGB") class TorchProcessor: - def __init__(self, config: Dict, tokenizer_path: PathLike): + def __init__(self, config_path: PathLike, tokenizer_path: PathLike): """ :param config: model config :param tokenizer_path: path to tokenizer file :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy) """ + config = json.load(open(config_path, "r")) self._image_size = config["image_encoder"]["image_size"] self._max_seq_len = config["text_encoder"]["max_position_embeddings"] self._tokenizer = Tokenizer.from_file(tokenizer_path) diff --git a/swift/Embeddings.swift b/swift/Encoders.swift similarity index 98% rename from swift/Embeddings.swift rename to swift/Encoders.swift index 6d973ac..bc78433 100644 --- a/swift/Embeddings.swift +++ b/swift/Encoders.swift @@ -11,6 +11,17 @@ import Foundation import Hub // `Config` import Tokenizers // `AutoTokenizer` + +enum EncoderError: Error { + case configLoadingError(String) + case modelLoadingError(String) + case unsupportedDataType + case invalidInput + case unsupportedShapeConstraint + case modelPredictionFailed(String) +} + + public enum Embedding { case i32s([Int32]) case f16s([Float16]) diff --git a/swift/EmbeddingsTests.swift b/swift/EncodersTests.swift similarity index 97% rename from swift/EmbeddingsTests.swift rename to swift/EncodersTests.swift index 889cdb6..caab363 100644 --- a/swift/EmbeddingsTests.swift +++ b/swift/EncodersTests.swift @@ -27,7 +27,7 @@ final class TokenizerTests: XCTestCase { let api = HubApi(hfToken: "xxx") let textModel = try await TextEncoder( - modelName: "unum-cloud/uform2-vl-english-small", + modelName: "unum-cloud/uform3-image-text-english-small", hubApi: api ) @@ -78,11 +78,11 @@ final class TokenizerTests: XCTestCase { // A better option is to fetch directly from HuggingFace, similar to how users would do that: let api = HubApi(hfToken: "xxx") let textModel = try await TextEncoder( - modelName: "unum-cloud/uform2-vl-english-small", + modelName: "unum-cloud/uform3-image-text-english-small", hubApi: api ) let imageModel = try await ImageEncoder( - 
modelName: "unum-cloud/uform2-vl-english-small", + modelName: "unum-cloud/uform3-image-text-english-small", hubApi: api ) diff --git a/swift/README.md b/swift/README.md index 1eebf29..66b531f 100644 --- a/swift/README.md +++ b/swift/README.md @@ -19,7 +19,7 @@ import UForm ### Text Embeddings ```swift -let textModel = try await TextEncoder(modelName: "unum-cloud/uform2-vl-english-small") +let textModel = try await TextEncoder(modelName: "unum-cloud/uform3-image-text-english-small") let text = "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie." let textEmbedding: Embedding = try textModel.forward(with: text) let textVector: [Float32] = textEmbedding.asFloats() @@ -28,7 +28,7 @@ let textVector: [Float32] = textEmbedding.asFloats() ### Image Embeddings ```swift -let imageModel = try await ImageEncoder(modelName: "unum-cloud/uform2-vl-english-small") +let imageModel = try await ImageEncoder(modelName: "unum-cloud/uform3-image-text-english-small") let imageURL = "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true" guard let url = URL(string: imageURL), let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil),