Commit 18a3bb6

ashvardanian committed Apr 23, 2024
2 parents 3e1e576 + 9bf5fe3

Showing 19 changed files with 1,254 additions and 139 deletions.
52 changes: 20 additions & 32 deletions README.md
@@ -51,13 +51,12 @@ With compact __custom pre-trained transformer models__, this can run anywhere fr

### Embedding Models

-| Model | Parameters | Languages | Architecture |
-| :--------------------------------------- | ---------: | --------: | -------------------------------------------: |
-| [`uform-vl-english-large`][model-e-l] 🆕 | 365M | 1 | 6 text layers, ViT-L/14, 6 multimodal layers |
-| [`uform-vl-english`][model-e] | 143M | 1 | 2 text layers, ViT-B/16, 2 multimodal layers |
-| [`uform-vl-english-small`][model-e-s] 🆕 | 79M | 1 | 2 text layers, ViT-S/16, 2 multimodal layers |
-| [`uform-vl-multilingual-v2`][model-m-v2] | 206M | 21 | 8 text layers, ViT-B/16, 4 multimodal layers |
-| [`uform-vl-multilingual`][model-m] | 206M | 12 | 8 text layers, ViT-B/16, 4 multimodal layers |
+| Model | Parameters | Languages | Architecture |
+| :-------------------------------------------------- | ---------: | --------: | -------------------------------------------: |
+| [`uform3-image-text-english-large`][model-e-l] 🆕 | 365M | 1 | 6 text layers, ViT-L/14, 6 multimodal layers |
+| [`uform3-image-text-english-base`][model-e] | 143M | 1 | 2 text layers, ViT-B/16, 2 multimodal layers |
+| [`uform3-image-text-english-small`][model-e-s] 🆕 | 79M | 1 | 2 text layers, ViT-S/16, 2 multimodal layers |
+| [`uform3-image-text-multilingual-base`][model-m-v2] | 206M | 21 | 8 text layers, ViT-B/16, 4 multimodal layers |

[model-e-l]: https://huggingface.co/unum-cloud/uform-vl-english-large/
[model-e]: https://huggingface.co/unum-cloud/uform-vl-english/
@@ -307,34 +306,18 @@ prompt_len = inputs['input_ids'].shape[1]
decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
```

-### Multimodal Chat
+### Multimodal Chat in CLI

-The generative models can be used for chat-like experiences, where the user can provide both text and images as input.
-To use that feature, you can start with the following CLI command:
+The generative models can be used for chat-like experiences in the command line.
+For that, you can use the `uform-chat` CLI tool, which is available in the UForm package.

```bash
-uform-chat --model unum-cloud/uform-gen-chat --image=zebra.jpg
-uform-chat --model unum-cloud/uform-gen-chat \
-    --image="https://bit.ly/3tIVg9M" \
-    --device="cuda:0" \
-    --fp16
-```
-
-### Multi-GPU
-
-To achieve higher throughput, you can launch UForm on multiple GPUs.
-For that pick the encoder of the model you want to run in parallel (`text_encoder` or `image_encoder`), and wrap it in `nn.DataParallel` (or `nn.DistributedDataParallel`).
-
-```python
-import uform
-
-model, processor = uform.get_model('unum-cloud/uform-vl-english')
-model_image = nn.DataParallel(model.image_encoder)
-
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-model_image.to(device)
-
-_, res = model_image(images, 0)
+$ pip install uform
+$ uform-chat --model unum-cloud/uform-gen2-dpo --image=zebra.jpg
+$ uform-chat --model unum-cloud/uform-gen2-dpo \
+>     --image="https://bit.ly/3tIVg9M" \
+>     --device="cuda:0" \
+>     --fp16
```

## Evaluation
@@ -471,3 +454,8 @@ On Apple M2 Arm chips the energy efficiency of inference can exceed that of the
## License

All models come under the same license as the code - Apache 2.0.


TODO:

- [ ] Download the image if a URL is provided
63 changes: 60 additions & 3 deletions javascript/README.md
@@ -1,10 +1,67 @@
# UForm for JavaScript

The UForm multimodal AI SDK offers a simple way to integrate multimodal AI capabilities into your JavaScript applications.
Built around ONNX, the SDK is designed to work with most JavaScript runtimes and almost any hardware.

## Installation

There are several ways to install the UForm JavaScript SDK from NPM.

```bash
pnpm add uform
npm add uform
yarn add uform
```

## Quick Start

### Embeddings

```js
import { getModel, Modality } from 'uform';
import { TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from 'uform';
import assert from 'assert'; // Node's built-in assert, used below for sanity checks

const { configPath, modalityPaths, tokenizerPath } = await getModel(
    'unum-cloud/uform3-image-text-english-small',  // model ID on the Hugging Face Hub
    [Modality.TextEncoder, Modality.ImageEncoder], // modalities to download
    null,       // optional Hugging Face token for private models
    '.onnx',    // model format
    './models', // directory to save the model to
);

const textProcessor = new TextProcessor(configPath, tokenizerPath);
await textProcessor.init();
const processedTexts = await textProcessor.process("a small red panda in a zoo");

const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor);
await textEncoder.init();
const textOutput = await textEncoder.encode(processedTexts);
assert(textOutput.embeddings.dims.length === 2, "Output should be 2D");
await textEncoder.dispose();

const imageProcessor = new ImageProcessor(configPath);
await imageProcessor.init();
const processedImages = await imageProcessor.process("path/to/image.png");

const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor);
await imageEncoder.init();
const imageOutput = await imageEncoder.encode(processedImages);
assert(imageOutput.embeddings.dims.length === 2, "Output should be 2D");
await imageEncoder.dispose();
```

Both `textOutput` and `imageOutput` contain `features` and `embeddings` properties, matching their counterparts in the Python SDK.
The embeddings can later be compared using cosine similarity or other distance metrics.
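As a quick illustration, the two embeddings can be compared in plain JavaScript. Below is a minimal sketch — `cosineSimilarity` is a hypothetical helper, not part of the UForm SDK — and it assumes the ONNX output tensors expose their raw values through a `cpuData` buffer, as in the repository's test suite:

```js
// Hypothetical helper: cosine similarity between two embedding vectors.
function cosineSimilarity(a, b) {
    let dot = 0.0, normA = 0.0, normB = 0.0;
    for (let i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        normA += a[i] * a[i];
        normB += b[i] * b[i];
    }
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}

const textVector = new Float32Array(textOutput.embeddings.cpuData);
const imageVector = new Float32Array(imageOutput.embeddings.cpuData);
console.log(`Similarity: ${cosineSimilarity(textVector, imageVector)}`);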

### Generative Models

Coming soon ...

## Technical Details

### Faster Search

Depending on the application, the embeddings can be down-cast to smaller numeric representations without losing much recall.
Independent of the quantization level, native JavaScript functionality may be too slow for large-scale search.
In such cases, consider using [USearch][github-usearch] or [SimSIMD][github-simsimd].
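As an illustration, down-casting to `Int8Array` can look like the sketch below. This is an assumption-laden example, not the SDK's canonical quantization: the 127 scale presumes the embeddings are L2-normalized, so every component lies in [-1, 1].

```js
// A sketch: quantize an L2-normalized fp32 embedding into int8.
// Assumes every component lies in [-1, 1]; values are clamped just in case.
function quantizeToInt8(embedding) {
    const quantized = new Int8Array(embedding.length);
    for (let i = 0; i < embedding.length; i++)
        quantized[i] = Math.round(Math.max(-1, Math.min(1, embedding[i])) * 127);
    return quantized;
}
```

The resulting `i8` vectors can then be indexed with USearch, which supports low-precision scalar types natively.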

[github-usearch]: https://github.com/unum-cloud/usearch
[github-simsimd]: https://github.com/ashvardanian/simsimd
38 changes: 24 additions & 14 deletions javascript/encoders.mjs
@@ -3,7 +3,7 @@ import { InferenceSession, Tensor } from 'onnxruntime-node';
import { PreTrainedTokenizer } from '@xenova/transformers';
import sharp from 'sharp';

-import { getCheckpoint, Modality } from "./hub.mjs";
+import { getModel, Modality } from "./hub.mjs";

class TextProcessor {

@@ -66,7 +66,7 @@ class TextEncoder {
}
}

-async forward(inputs) {
+async encode(inputs) {
if (!this.session) {
throw new Error("Session is not initialized.");
}
@@ -125,21 +125,22 @@ class ImageProcessor {
this.normalizationMeans = config.normalization_means;
this.normalizationDeviations = config.normalization_deviations;

-this.imageMean = new Float32Array(this.normalizationMeans).fill(0);
-this.imageStd = new Float32Array(this.normalizationDeviations).fill(0);
+this.imageMean = new Float32Array(this.normalizationMeans);
+this.imageStd = new Float32Array(this.normalizationDeviations);
}
async process(images) {
const processSingle = async (image) => {
-let img = sharp(image);
+let img = sharp(image).toColorspace('srgb');
const metadata = await img.metadata();
const scale = this.imageSize / Math.min(metadata.width, metadata.height);
-const scaledWidth = parseInt(metadata.width * scale);
-const scaledHeight = parseInt(metadata.height * scale);
+const scaledWidth = Math.ceil(metadata.width * scale);
+const scaledHeight = Math.ceil(metadata.height * scale);
img = img.resize({
width: scaledWidth,
height: scaledHeight,
fit: sharp.fit.cover,
-position: sharp.strategy.entropy
+position: sharp.strategy.entropy,
+options: sharp.interpolators.bicubic
}).extract({
left: Math.max(0, Math.floor((scaledWidth - this.imageSize) / 2)),
top: Math.max(0, Math.floor((scaledHeight - this.imageSize) / 2)),
@@ -148,12 +149,21 @@
}).removeAlpha();

let buffer = await img.raw().toBuffer();
-let array = new Float32Array(buffer);
-
-return array.map((value, index) => {
-    const channel = index % 3;
-    return (value / 255.0 - this.normalizationMeans[channel]) / this.normalizationDeviations[channel];
-});
+let array = new Float32Array(buffer.length);
+
+// When we export into the `array`, we reorder the dimensions of the tensor
+// from HWC to CHW, and normalize the pixel values.
+let channelSize = this.imageSize * this.imageSize;
+for (let i = 0; i < this.imageSize * this.imageSize; i++) {
+    let r = buffer[i * 3];
+    let g = buffer[i * 3 + 1];
+    let b = buffer[i * 3 + 2];
+    array[i] = (r / 255.0 - this.imageMean[0]) / this.imageStd[0];
+    array[channelSize + i] = (g / 255.0 - this.imageMean[1]) / this.imageStd[1];
+    array[channelSize * 2 + i] = (b / 255.0 - this.imageMean[2]) / this.imageStd[2];
+}
+return array;
};

if (Array.isArray(images)) {
@@ -181,7 +191,7 @@ class ImageEncoder {
}
}

-async forward(images) {
+async encode(images) {
if (!this.session) {
throw new Error("Session is not initialized.");
}
37 changes: 21 additions & 16 deletions javascript/encoders_test.js
@@ -4,7 +4,7 @@ import path from 'path';
import assert from 'assert';
import fetch from 'node-fetch';

-import { getCheckpoint, Modality } from "./hub.mjs";
+import { getModel, Modality } from "./hub.mjs";
import { TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from "./encoders.mjs";

// Check if the HuggingFace Hub API token is set in the environment variable.
@@ -18,7 +18,7 @@ if (!hf_token) {
}

async function tryGettingCheckpoint(modelId, modalities) {
-const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint(
+const { configPath, modalityPaths, tokenizerPath } = await getModel(
modelId,
modalities,
hf_token,
@@ -60,7 +60,7 @@ async function testGetCheckpoint() {

async function tryTextEncoderForwardPass(modelId) {
const modalities = [Modality.TextEncoder];
-const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint(
+const { configPath, modalityPaths, tokenizerPath } = await getModel(
modelId,
modalities,
hf_token,
@@ -69,19 +69,19 @@

const textProcessor = new TextProcessor(configPath, tokenizerPath);
await textProcessor.init();
const processedTexts = await textProcessor.process("Hello, world!");
const processedTexts = await textProcessor.process("a small red panda in a zoo");

const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor);
await textEncoder.init();
-const textOutput = await textEncoder.forward(processedTexts);
+const textOutput = await textEncoder.encode(processedTexts);
assert(textOutput.embeddings.dims.length === 2, "Output should be 2D");

await textEncoder.dispose();
}

async function tryImageEncoderForwardPass(modelId) {
const modalities = [Modality.ImageEncoder];
-const { configPath, modalityPaths } = await getCheckpoint(
+const { configPath, modalityPaths } = await getModel(
modelId,
modalities,
hf_token,
@@ -94,7 +94,7 @@

const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor);
await imageEncoder.init();
-const imageOutput = await imageEncoder.forward(processedImages);
+const imageOutput = await imageEncoder.encode(processedImages);
assert(imageOutput.embeddings.dims.length === 2, "Output should be 2D");

await imageEncoder.dispose();
@@ -135,7 +135,7 @@ async function fetchImage(url) {
async function tryCrossReferencingImageAndText(modelId) {

const modalities = [Modality.ImageEncoder, Modality.TextEncoder];
-const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint(
+const { configPath, modalityPaths, tokenizerPath } = await getModel(
modelId,
modalities,
hf_token,
@@ -177,12 +177,17 @@
const processedText = await textProcessor.process(text);
const processedImage = await imageProcessor.process(imageBuffer);

-const textEmbedding = await textEncoder.forward(processedText);
-const imageEmbedding = await imageEncoder.forward(processedImage);
+const textEmbedding = await textEncoder.encode(processedText);
+const imageEmbedding = await imageEncoder.encode(processedImage);

-textEmbeddings.push(new Float32Array(textEmbedding.embeddings.data));
-imageEmbeddings.push(new Float32Array(imageEmbedding.embeddings.data));
-console.log(`Text: ${text}, Image: ${imageUrl}, Similarity: ${cosineSimilarity(textEmbedding.embeddings, imageEmbedding.embeddings)}`);
+textEmbeddings.push(new Float32Array(textEmbedding.embeddings.cpuData));
+imageEmbeddings.push(new Float32Array(imageEmbedding.embeddings.cpuData));
+
+// Print-based debugging at its best :)
+// console.log(`Text: ${text}, Image: ${imageUrl}`);
+// console.log(`Text embedding first components: ${textEmbeddings[i].slice(0, 5)}`);
+// console.log(`Image embedding first components: ${imageEmbeddings[i].slice(0, 5)}`);
+console.log(`Similarity: ${cosineSimilarity(textEmbeddings[i], imageEmbeddings[i])}`)
}

for (let i = 0; i < texts.length; i++) {
@@ -209,9 +214,9 @@ async function testEncoders() {
// Go through the bi-modal models
for (const modelId of [
'unum-cloud/uform3-image-text-english-small',
-'unum-cloud/uform3-image-text-english-base',
-'unum-cloud/uform3-image-text-english-large',
-'unum-cloud/uform3-image-text-multilingual-base',
+// 'unum-cloud/uform3-image-text-english-base',
+// 'unum-cloud/uform3-image-text-english-large',
+// 'unum-cloud/uform3-image-text-multilingual-base',
]) {
await tryTextEncoderForwardPass(modelId, hf_token);
await tryImageEncoderForwardPass(modelId, hf_token);
4 changes: 2 additions & 2 deletions javascript/hub.mjs
@@ -33,7 +33,7 @@ async function ensureDirectoryExists(dirPath) {
}
}

-async function getCheckpoint(modelId, modalities, token = null, format = '.onnx', saveDir = './models') {
+async function getModel(modelId, modalities, token = null, format = '.onnx', saveDir = './models') {
modalities = normalizeModalities(modalities);

const configNames = ['config.json'];
@@ -101,4 +101,4 @@ async function getCheckpoint(modelId, modalities, token = null, format = '.onnx'
return { configPath, modalityPaths, tokenizerPath };
}

-export { getCheckpoint, Modality };
+export { getModel, Modality };
1 change: 1 addition & 0 deletions package.json
@@ -7,6 +7,7 @@
"dependencies": {
"@huggingface/hub": "^0.14.8",
"@xenova/transformers": "^2.17.0",
"node-fetch": "^3.3.2",
"onnxruntime-node": "^1.17.0",
"onnxruntime-web": "^1.17.3"
},