diff --git a/.vscode/settings.json b/.vscode/settings.json index 3a060e1..3275f93 100755 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -21,7 +21,9 @@ "ndarray", "numpy", "ONNX", + "onnxconverter", "onnxruntime", + "opset", "packbits", "preprocess", "pretrained", @@ -48,5 +50,8 @@ "[python]": { "editor.defaultFormatter": "ms-python.black-formatter" }, - "python.formatting.provider": "none" + "python.formatting.provider": "none", + "window.autoDetectColorScheme": true, + "workbench.colorTheme": "Default Dark+", + "workbench.preferredDarkColorTheme": "Default Dark+" } \ No newline at end of file diff --git a/python/scripts/export_encoders.ipynb b/python/scripts/export_encoders.ipynb index 029e60a..a8b868d 100644 --- a/python/scripts/export_encoders.ipynb +++ b/python/scripts/export_encoders.ipynb @@ -19,7 +19,6 @@ "metadata": {}, "outputs": [], "source": [ - "!pip uninstall -y uform\n", "!pip install --upgrade \"uform[torch]\" coremltools" ] }, @@ -30,8 +29,13 @@ "outputs": [], "source": [ "import os\n", - "model_name = \"uform-vl-english-small\"\n", - "output_directory = \"../../\"" + "\n", + "working_directory = \"../..\"\n", + "model_name = \"uform3-image-text-english-small\"\n", + "model_directory = os.path.join(working_directory, \"models\", model_name)\n", + "model_weights_path = os.path.join(model_directory, \"torch_weight.pt\")\n", + "config_path = os.path.join(model_directory, \"config.json\")\n", + "tokenizer_path = os.path.join(model_directory, \"tokenizer.json\")" ] }, { @@ -40,20 +44,20 @@ "metadata": {}, "outputs": [], "source": [ - "import uform\n", - "from PIL import Image\n", - "\n", - "model, processor = uform.get_model('unum-cloud/' + model_name)\n", - "text = 'a small red panda in a zoo'\n", - "image = Image.open('../../assets/unum.png')\n", - "\n", - "image_data = processor.preprocess_image(image)\n", - "text_data = processor.preprocess_text(text)\n", - "\n", - "image_features, image_embedding = model.encode_image(image_data, return_features=True)\n", - "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n", + "import torch\n", "\n", - "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape" + "state_dict = torch.load(model_weights_path)\n", + "list(state_dict.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from uform.torch_encoders import ImageEncoder, TextEncoder\n", + "from uform.torch_processors import ImageProcessor, TextProcessor" ] }, { @@ -62,7 +66,9 @@ "metadata": {}, "outputs": [], "source": [ - "model.text_encoder" + "image_encoder = ImageEncoder.from_pretrained(config_path, state_dict)\n", + "text_encoder = TextEncoder.from_pretrained(config_path, state_dict)\n", + "image_encoder, text_encoder" ] }, { @@ -71,7 +77,9 @@ "metadata": {}, "outputs": [], "source": [ - "model.image_encoder" + "text_processor = TextProcessor(config_path, tokenizer_path)\n", + "image_processor = ImageProcessor(config_path)\n", + "text_processor, image_processor" ] }, { @@ -80,14 +88,19 @@ "metadata": {}, "outputs": [], "source": [ - "# Assuming `model` is your loaded model with image_encoder and text_encoder attributes\n", - "for name, module in model.image_encoder.named_children():\n", - " print(f\"First layer of image_encoder: {name}\")\n", - " break # We break after the first layer\n", + "import uform\n", + "from PIL import Image\n", "\n", - "for name, module in model.text_encoder.named_children():\n", - " print(f\"First layer of 
text_encoder: {name}\")\n", - " break # We break after the first layer" + "text = 'a small red panda in a zoo'\n", + "image = Image.open('../../assets/unum.png')\n", + "\n", + "text_data = text_processor(text)\n", + "image_data = image_processor(image)\n", + "\n", + "image_features, image_embedding = image_encoder.forward(image_data, return_features=True)\n", + "text_features, text_embedding = text_encoder.forward(text_data, return_features=True)\n", + "\n", + "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape" ] }, { @@ -147,7 +160,7 @@ " input_shape = (ct.RangeDim(lower_bound=1, upper_bound=upper_bound, default=1),) + input_shape[1:]\n", " return input_shape\n", "\n", - "generalize_first_dimensions(image_data.shape), generalize_first_dimensions(text_data[\"input_ids\"].shape), generalize_first_dimensions(text_data[\"attention_mask\"].shape)" + "generalize_first_dimensions(image_data[\"images\"].shape), generalize_first_dimensions(text_data[\"input_ids\"].shape), generalize_first_dimensions(text_data[\"attention_mask\"].shape)" ] }, { @@ -156,7 +169,7 @@ "metadata": {}, "outputs": [], "source": [ - "image_input = ct.TensorType(name=\"images\", shape=generalize_first_dimensions(image_data.shape, 1))\n", + "image_input = ct.TensorType(name=\"images\", shape=generalize_first_dimensions(image_data[\"images\"].shape, 1))\n", "text_input = ct.TensorType(name=\"input_ids\", shape=generalize_first_dimensions(text_data[\"input_ids\"].shape, 1))\n", "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=generalize_first_dimensions(text_data[\"attention_mask\"].shape, 1))\n", "text_features = ct.TensorType(name=\"features\")\n", @@ -171,11 +184,11 @@ "metadata": {}, "outputs": [], "source": [ - "module = model.image_encoder\n", + "module = image_encoder\n", "module.eval()\n", "module.return_features = True\n", "\n", - "traced_script_module = torch.jit.trace(module, example_inputs=image_data)\n", + "traced_script_module = torch.jit.trace(module, example_inputs=image_data[\"images\"])\n", "traced_script_module" ] }, @@ -193,7 +206,7 @@ "coreml_model.author = 'Unum Cloud'\n", "coreml_model.license = 'Apache 2.0'\n", "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", - "coreml_model.save(os.path.join(output_directory, \"image_encoder.mlpackage\"))" + "coreml_model.save(os.path.join(model_directory, \"image_encoder.mlpackage\"))" ] }, { @@ -202,7 +215,7 @@ "metadata": {}, "outputs": [], "source": [ - "module = model.text_encoder\n", + "module = text_encoder\n", "module.eval()\n", "module.return_features = True\n", "\n", @@ -224,7 +237,7 @@ "coreml_model.author = 'Unum Cloud'\n", "coreml_model.license = 'Apache 2.0'\n", "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", - "coreml_model.save(os.path.join(output_directory, \"text_encoder.mlpackage\"))" + "coreml_model.save(os.path.join(model_directory, \"text_encoder.mlpackage\"))" ] }, { @@ -257,8 +270,8 @@ "metadata": {}, "outputs": [], "source": [ - "model.image_encoder.eval()\n", - "model.image_encoder.to(dtype=torch.bfloat16)" + "image_encoder.eval()\n", + "image_encoder.to(dtype=torch.bfloat16)" ] }, { @@ -267,7 +280,7 @@ "metadata": {}, "outputs": [], "source": [ - "torch.save(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.pt\"))" + "torch.save(image_encoder.state_dict(), os.path.join(model_directory, \"image_encoder.pt\"))" ] }, { @@ -276,7 +289,7 @@ "metadata": {}, "outputs": [], 
"source": [ - "save_file(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.safetensors\"))" + "save_file(image_encoder.state_dict(), os.path.join(model_directory, \"image_encoder.safetensors\"))" ] }, { @@ -285,8 +298,8 @@ "metadata": {}, "outputs": [], "source": [ - "model.text_encoder.eval()\n", - "model.text_encoder.to(dtype=torch.bfloat16)" + "text_encoder.eval()\n", + "text_encoder.to(dtype=torch.bfloat16)" ] }, { @@ -295,7 +308,7 @@ "metadata": {}, "outputs": [], "source": [ - "torch.save(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.pt\"))" + "torch.save(text_encoder.state_dict(), os.path.join(model_directory, \"text_encoder.pt\"))" ] }, { @@ -304,7 +317,7 @@ "metadata": {}, "outputs": [], "source": [ - "save_file(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.safetensors\"))" + "save_file(text_encoder.state_dict(), os.path.join(model_directory, \"text_encoder.safetensors\"))" ] }, { @@ -313,8 +326,8 @@ "metadata": {}, "outputs": [], "source": [ - "image_features, image_embedding = model.encode_image(image_data.to(dtype=torch.bfloat16), return_features=True)\n", - "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n", + "image_features, image_embedding = image_encoder.forward(image_data[\"images\"].to(dtype=torch.bfloat16), return_features=True)\n", + "text_features, text_embedding = text_encoder.forward(text_data, return_features=True)\n", "\n", "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape" ] @@ -358,7 +371,7 @@ "metadata": {}, "outputs": [], "source": [ - "module = model.text_encoder\n", + "module = text_encoder\n", "module.eval()\n", "module.return_features = True\n", "module.to(dtype=torch.float32)\n", @@ -366,7 +379,7 @@ "onnx_export(\n", " module,\n", " (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n", - " os.path.join(output_directory, \"text_encoder.onnx\"), \n", + " os.path.join(model_directory, \"text_encoder.onnx\"), \n", " export_params=True,\n", " opset_version=15,\n", " do_constant_folding=True,\n", @@ -392,15 +405,15 @@ "metadata": {}, "outputs": [], "source": [ - "module = model.image_encoder\n", + "module = image_encoder\n", "module.eval()\n", "module.return_features = True\n", "module.to(dtype=torch.float32)\n", "\n", "torch.onnx.export(\n", " module,\n", - " image_data, \n", - " os.path.join(output_directory, \"image_encoder.onnx\"), \n", + " image_data[\"images\"], \n", + " os.path.join(model_directory, \"image_encoder.onnx\"), \n", " export_params=True,\n", " opset_version=15,\n", " do_constant_folding=True,\n", @@ -437,7 +450,7 @@ "metadata": {}, "outputs": [], "source": [ - "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n", + "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n", "module = onnx.load(module_path)\n", "module_fp16 = float16.convert_float_to_float16(module)\n", "onnx.save(module_fp16, module_path)" @@ -449,7 +462,7 @@ "metadata": {}, "outputs": [], "source": [ - "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n", + "module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n", "module = onnx.load(module_path)\n", "module_fp16 = float16.convert_float_to_float16(module)\n", "onnx.save(module_fp16, module_path)" @@ -480,7 +493,7 @@ "metadata": {}, "outputs": [], "source": [ - "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n", + "module_path = os.path.join(model_directory, 
\"text_encoder.onnx\")\n", "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)" ] }, @@ -490,7 +503,7 @@ "metadata": {}, "outputs": [], "source": [ - "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n", + "module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n", "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)" ] }, @@ -512,7 +525,7 @@ "from onnx import helper\n", "\n", "# Load the ONNX model\n", - "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n", + "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n", "module = onnx.load(module_path)\n", "\n", "# Get the module's graph\n", @@ -599,7 +612,7 @@ "metadata": {}, "outputs": [], "source": [ - "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n", + "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n", "session = ort.InferenceSession(module_path, sess_options=session_options)" ] }, @@ -609,7 +622,7 @@ "metadata": {}, "outputs": [], "source": [ - "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n", + "module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n", "session = ort.InferenceSession(module_path, sess_options=session_options)" ] }, @@ -620,6 +633,15 @@ "# Upload to Hugging Face" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../models/uform3-image-text-english-small/ . --exclude=\"torch_weight.pt\"" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/python/scripts/test_encoders.py b/python/scripts/test_encoders.py index d26e4f2..bd26690 100644 --- a/python/scripts/test_encoders.py +++ b/python/scripts/test_encoders.py @@ -27,16 +27,16 @@ torch_models = [ "unum-cloud/uform3-image-text-english-small", - "unum-cloud/uform3-image-text-english-base", - "unum-cloud/uform3-image-text-english-large", - "unum-cloud/uform3-image-text-multilingual-base", + # "unum-cloud/uform3-image-text-english-base", + # "unum-cloud/uform3-image-text-english-large", + # "unum-cloud/uform3-image-text-multilingual-base", ] onnx_models = [ "unum-cloud/uform3-image-text-english-small", - "unum-cloud/uform3-image-text-english-base", - "unum-cloud/uform3-image-text-english-large", - "unum-cloud/uform3-image-text-multilingual-base", + # "unum-cloud/uform3-image-text-english-base", + # "unum-cloud/uform3-image-text-english-large", + # "unum-cloud/uform3-image-text-multilingual-base", ] # Let's check if the HuggingFace Hub API token is set in the environment variable. 
@@ -198,8 +198,8 @@ def test_onnx_one_embedding(model_name: str, device: str): # Test if the model outputs actually make sense cross_references_image_and_text_embeddings( - lambda text: model_text(processor_text(text)), - lambda image: model_image(processor_image(image)), + lambda text: model_text(processor_text(text))[1], + lambda image: model_image(processor_image(image))[1], ) except ExecutionProviderError as e: diff --git a/python/uform/numpy_processors.py b/python/uform/numpy_processors.py index a5faca2..027bc0d 100644 --- a/python/uform/numpy_processors.py +++ b/python/uform/numpy_processors.py @@ -34,7 +34,7 @@ def __call__(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]: input_ids = np.full( (len(texts), self._max_seq_len), fill_value=self._pad_token_idx, - dtype=np.int64, + dtype=np.int32, ) attention_mask = np.zeros( diff --git a/python/uform/onnx_encoders.py b/python/uform/onnx_encoders.py index 9f63fa4..a6f27d3 100644 --- a/python/uform/onnx_encoders.py +++ b/python/uform/onnx_encoders.py @@ -64,6 +64,7 @@ def __init__( model_path: str, *, device: Literal["cpu", "cuda"] = "cpu", + return_features: bool = True, ): """ :param model_path: Path to onnx model @@ -73,14 +74,21 @@ def __init__( session_options = ort.SessionOptions() session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + self.return_features = return_features self.session = ort.InferenceSession( model_path, sess_options=session_options, providers=available_providers(device), ) - def __call__(self, images: ndarray) -> Tuple[ndarray, ndarray]: - return self.session.run(None, {"images": images}) + def __call__( + self, images: ndarray, return_features: Optional[bool] = None + ) -> Union[ndarray, Tuple[ndarray, ndarray]]: + features, embeddings = self.session.run(None, {"images": images}) + return_features = return_features if return_features is not None else self.return_features + if return_features: + return features, embeddings + return embeddings class TextEncoder: @@ -89,6 +97,7 @@ def __init__( model_path: str, *, device: Literal["cpu", "cuda"] = "cpu", + return_features: bool = True, ): """ :param text_encoder_path: Path to onnx of text encoder @@ -98,11 +107,31 @@ def __init__( session_options = ort.SessionOptions() session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + self.return_features = return_features self.text_encoder_session = ort.InferenceSession( model_path, sess_options=session_options, providers=available_providers(device), ) - def __call__(self, input_ids: ndarray, attention_mask: ndarray) -> Tuple[ndarray, ndarray]: - return self.text_encoder_session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask}) + def __call__( + self, + x: Union[ndarray, dict], + attention_mask: Optional[ndarray] = None, + return_features: Optional[bool] = None, + ) -> Union[ndarray, Tuple[ndarray, ndarray]]: + if isinstance(x, dict): + assert attention_mask is None, "If `x` is a dictionary, then `attention_mask` should be None" + attention_mask = x["attention_mask"] + input_ids = x["input_ids"] + else: + input_ids = x + + features, embeddings = self.text_encoder_session.run( + None, {"input_ids": input_ids, "attention_mask": attention_mask} + ) + + return_features = return_features if return_features is not None else self.return_features + if return_features: + return features, embeddings + return embeddings diff --git a/python/uform/torch_encoders.py b/python/uform/torch_encoders.py index 8ac7c36..0504a74 100644 --- 
a/python/uform/torch_encoders.py +++ b/python/uform/torch_encoders.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from os import PathLike -from typing import Dict, Optional, Tuple, Union, Callable +from typing import Dict, Optional, Union, Mapping, Any import json import torch @@ -274,7 +274,12 @@ def forward( return embeddings @staticmethod - def from_pretrained(config: Union[PathLike, str, object], model_path: Union[PathLike, str]) -> TextEncoder: + def from_pretrained(config: Union[PathLike, str, object], model: Union[PathLike, str, Mapping[str, Any]]) -> TextEncoder: + """Load the text encoder from the given configuration and model path. + + :param config: the configuration dictionary or path to the JSON configuration file + :param model: the model state dictionary or path to the `.pt` model file + """ if isinstance(config, (PathLike, str)): config = json.load(open(config, "r")) if "text_encoder" in config: @@ -283,9 +288,15 @@ def from_pretrained(config: Union[PathLike, str, object], model_path: Union[Path # We must strip all the non-member attributes before initializing the classes. text_fields = TextEncoder.__dataclass_fields__ config = {k: v for k, v in config.items() if k in text_fields} - - state = torch.load(model_path) encoder = TextEncoder(**config) + + # Load from disk + if isinstance(model, (PathLike, str)): + state = torch.load(model) + else: + state = model + if "text_encoder" in state: + state = state["text_encoder"] encoder.load_state_dict(state) return encoder @@ -351,7 +362,15 @@ def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor: return embeddings @staticmethod - def from_pretrained(config: Union[PathLike, str, object], model_path: Union[PathLike, str]) -> ImageEncoder: + def from_pretrained( + config: Union[PathLike, str, object], + model: Union[PathLike, str, Mapping[str, Any]], + ) -> ImageEncoder: + """Load the image encoder from the given configuration and model path. + + :param config: the configuration dictionary or path to the JSON configuration file + :param model: the model state dictionary or path to the `.pt` model file + """ if isinstance(config, (PathLike, str)): config = json.load(open(config, "r")) if "image_encoder" in config: @@ -360,8 +379,14 @@ def from_pretrained(config: Union[PathLike, str, object], model_path: Union[Path # We must strip all the non-member attributes before initializing the classes. image_fields = ImageEncoder.__dataclass_fields__ config = {k: v for k, v in config.items() if k in image_fields} - - state = torch.load(model_path) encoder = ImageEncoder(**config) + + # Load from disk + if isinstance(model, (PathLike, str)): + state = torch.load(model) + else: + state = model + if "image_encoder" in state: + state = state["image_encoder"] encoder.load_state_dict(state) return encoder
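
Taken together, the onnx_encoders and numpy_processors changes let the exported models be driven much like the Torch ones. The sketch below is a rough usage outline, not part of the patch: it assumes the notebook's ONNX export landed in models/uform3-image-text-english-small/ and that uform.numpy_processors exposes ImageProcessor/TextProcessor with the same constructor arguments as their Torch counterparts.

import os

from PIL import Image

from uform.numpy_processors import ImageProcessor, TextProcessor
from uform.onnx_encoders import ImageEncoder, TextEncoder

# Illustrative paths; assumes the notebook's ONNX export step has been run.
model_directory = "models/uform3-image-text-english-small"
config_path = os.path.join(model_directory, "config.json")
tokenizer_path = os.path.join(model_directory, "tokenizer.json")

# Assumption: the NumPy processors mirror the Torch processors' constructors.
text_processor = TextProcessor(config_path, tokenizer_path)
image_processor = ImageProcessor(config_path)

text_encoder = TextEncoder(os.path.join(model_directory, "text_encoder.onnx"), device="cpu")
image_encoder = ImageEncoder(os.path.join(model_directory, "image_encoder.onnx"), device="cpu", return_features=False)

text_data = text_processor("a small red panda in a zoo")      # dict with input_ids / attention_mask
image_data = image_processor(Image.open("assets/unum.png"))

# The text encoder now accepts the processor's dict directly; features are returned by default.
text_features, text_embedding = text_encoder(text_data)

# `return_features` can be fixed at construction (embeddings only) or overridden per call.
image_embedding = image_encoder(image_data)
image_features, image_embedding = image_encoder(image_data, return_features=True)

Keeping `return_features` as both a constructor default and a per-call override mirrors the Torch encoders, so call sites like the updated test's `model_text(...)[1]` indexing keep returning the embedding.
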