Skip to content

Commit

Permalink
Merge pull request huggingface#14 from xenova/clip
Browse files Browse the repository at this point in the history
Add CLIP model
  • Loading branch information
xenova authored Mar 15, 2023
2 parents 00273ff + 1b23229 commit 4fdbc27
Show file tree
Hide file tree
Showing 16 changed files with 522 additions and 172 deletions.
10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
# Transformers.js
![https://www.npmjs.com/package/@xenova/transformers](https://img.shields.io/npm/v/@xenova/transformers) ![https://www.npmjs.com/package/@xenova/transformers](https://img.shields.io/npm/dw/@xenova/transformers)
![https://github.com/xenova/transformers.js/blob/main/LICENSE](https://img.shields.io/github/license/xenova/transformers.js)

Run 🤗 Transformers in your browser! We currently support [BERT](https://huggingface.co/docs/transformers/model_doc/bert), [ALBERT](https://huggingface.co/docs/transformers/model_doc/albert), [DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert), [T5](https://huggingface.co/docs/transformers/model_doc/t5), [T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1), [FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5), [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2), [BART](https://huggingface.co/docs/transformers/model_doc/bart), [CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen), [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper), [Vision Transformer](https://huggingface.co/docs/transformers/model_doc/vit), and [VisionEncoderDecoder](https://huggingface.co/docs/transformers/model_doc/vision-encoder-decoder) models, for a variety of tasks including: masked language modelling, text classification, text-to-text generation, translation, summarization, question answering, text generation, automatic speech recognition, image classification, and image-to-text.

[![npm](https://img.shields.io/npm/v/@xenova/transformers)](https://www.npmjs.com/package/@xenova/transformers)
[![downloads](https://img.shields.io/npm/dw/@xenova/transformers)](https://www.npmjs.com/package/@xenova/transformers)
[![license](https://img.shields.io/github/license/xenova/transformers.js)](https://github.com/xenova/transformers.js/blob/main/LICENSE)


Run 🤗 Transformers in your browser! We currently support [BERT](https://huggingface.co/docs/transformers/model_doc/bert), [ALBERT](https://huggingface.co/docs/transformers/model_doc/albert), [DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert), [T5](https://huggingface.co/docs/transformers/model_doc/t5), [T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1), [FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5), [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2), [BART](https://huggingface.co/docs/transformers/model_doc/bart), [CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen), [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper), [CLIP](https://huggingface.co/docs/transformers/model_doc/clip), [Vision Transformer](https://huggingface.co/docs/transformers/model_doc/vit), and [VisionEncoderDecoder](https://huggingface.co/docs/transformers/model_doc/vision-encoder-decoder) models, for a variety of tasks including: masked language modelling, text classification, text-to-text generation, translation, summarization, question answering, text generation, automatic speech recognition, image classification, zero-shot image classification, and image-to-text.

![teaser](https://user-images.githubusercontent.com/26504141/221056008-e906614e-e6f0-4e10-b0a8-7d5c99e955b4.gif)

Expand Down
47 changes: 46 additions & 1 deletion assets/js/scripts.js
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,19 @@ const IMAGE_CLASSIFICATION_OUTPUT_CANVAS = document.getElementById('ic-canvas');
const CODE_COMPLETION_CONTAINER = document.getElementById('code-completion-container');


// DOM handles for the zero-shot image classification (ZSIC) demo widgets.
const ZSIC_SELECT = document.getElementById('zsic-select'); // example selector (paired with input/media below)
const ZSIC_INPUT = document.getElementById('zsic-file'); // file-upload input
const ZSIC_CLASSES = document.getElementById('zsic-classes'); // comma-separated candidate-class textbox (parsed by getZSICClasses)
const ZSIC_IMG = document.getElementById('zsic-viewer'); // image element passed to getImageDataFromImage — presumably the preview; confirm in HTML
const ZSIC_OUTPUT_CANVAS = document.getElementById('zsic-canvas'); // canvas backing the 'zsic-canvas' bar chart in CHARTS



[
[SPEECH2TEXT_SELECT, SPEECH2TEXT_INPUT, SPEECH2TEXT_AUDIO],
[TEXT2IMAGE_SELECT, TEXT2IMAGE_INPUT, TEXT2IMAGE_IMG],
[IMAGE_CLASSIFICATION_SELECT, IMAGE_CLASSIFICATION_INPUT, IMAGE_CLASSIFICATION_IMG],
[ZSIC_SELECT, ZSIC_INPUT, ZSIC_IMG],
].forEach(x => {
let [select, input, media] = x;

Expand Down Expand Up @@ -214,10 +223,36 @@ const CHARTS = {
}]
},
options: CHART_OPTIONS
})
}),

'zsic-canvas': new Chart(ZSIC_OUTPUT_CANVAS, {
type: 'bar',
data: {
labels: ['football', 'airport', 'animals'],
datasets: [{
borderWidth: 1
}]
},
options: CHART_OPTIONS
}),

}


/**
 * Parse the user-entered candidate class list for zero-shot image
 * classification.
 *
 * Splits the textbox value on commas, trims surrounding whitespace from
 * each entry, and drops empty entries (e.g. from ",,", a trailing comma,
 * or a whitespace-only textbox — the original regex split left a
 * whitespace-only value as a single truthy entry).
 *
 * @returns {string[]} The cleaned class labels (possibly empty).
 */
function getZSICClasses() {
    return ZSIC_CLASSES.value
        .split(',')
        .map(x => x.trim())
        .filter(x => x !== '');
}
// Keep the ZSIC bar chart's x-axis labels in sync with the class textbox.
ZSIC_CLASSES.addEventListener('input', () => {
    const zsicChart = CHARTS[ZSIC_OUTPUT_CANVAS.id];
    zsicChart.data.labels = getZSICClasses();
    zsicChart.update();
});



function updateVisibility() {
for (let element of TASKS) {
if (element.getAttribute('task').split(',').includes(TASK_SELECTOR.value)) {
Expand All @@ -228,6 +263,7 @@ function updateVisibility() {
}
}
updateVisibility();

// Add event listeners
TASK_SELECTOR.addEventListener('input', updateVisibility);

Expand Down Expand Up @@ -323,6 +359,15 @@ GENERATE_BUTTON.addEventListener('click', async (e) => {
data.updateLabels = true
break;


case 'zero-shot-image-classification':
data.image = getImageDataFromImage(ZSIC_IMG)
data.classes = getZSICClasses()
data.elementIdToUpdate = ZSIC_OUTPUT_CANVAS.id
data.targetType = 'chart'
data.updateLabels = true
break;

default:
return;
}
Expand Down
35 changes: 33 additions & 2 deletions assets/js/worker.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ const TASK_FUNCTION_MAPPING = {
'summarization': summarize,
'automatic-speech-recognition': speech_to_text,
'image-to-text': image_to_text,
'image-classification': image_classification
'image-classification': image_classification,
'zero-shot-image-classification': zero_shot_image_classification,
}

// Listen for messages from UI
Expand Down Expand Up @@ -117,6 +118,12 @@ class ImageClassificationPipelineFactory extends PipelineFactory {
static model = 'google/vit-base-patch16-224';
}


// Pipeline factory for the zero-shot image classification task, backed by
// OpenAI's CLIP ViT-B/16 checkpoint. The base PipelineFactory (defined
// elsewhere in this file) consumes these static fields via getInstance —
// see zero_shot_image_classification below for usage.
class ZeroShotImageClassificationPipelineFactory extends PipelineFactory {
    static task = 'zero-shot-image-classification';
    static model = 'openai/clip-vit-base-patch16';
}

async function translate(data) {

let pipeline = await TranslationPipelineFactory.getInstance(data => {
Expand Down Expand Up @@ -364,4 +371,28 @@ async function image_classification(data) {
data: outputs
});

}
}


/**
 * Worker handler for the zero-shot image classification task.
 *
 * Classifies `data.image` against the user-supplied `data.classes` labels
 * using the CLIP pipeline, then posts a 'complete' message back to the UI
 * thread with the outputs and the chart-update metadata echoed from `data`.
 *
 * @param {Object} data - Message payload: image, classes, elementIdToUpdate,
 *                        targetType, updateLabels.
 */
async function zero_shot_image_classification(data) {
    const pipeline = await ZeroShotImageClassificationPipelineFactory.getInstance(progress => {
        // Forward model-download progress to the UI. The original reported
        // task: 'image-classification' here — a copy-paste from the sibling
        // image_classification handler — which misattributed this task's
        // download progress. Use this task's own name instead.
        // NOTE(review): confirm the UI's download handler keys on task name.
        self.postMessage({
            type: 'download',
            task: 'zero-shot-image-classification',
            data: progress
        });
    });

    const outputs = await pipeline(data.image, data.classes);

    self.postMessage({
        type: 'complete',
        target: data.elementIdToUpdate,
        targetType: data.targetType,
        updateLabels: data.updateLabels,
        data: outputs
    });
}


122 changes: 117 additions & 5 deletions dist/transformers.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion dist/transformers.js.map

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions dist/transformers.min.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dist/transformers.min.js.map

Large diffs are not rendered by default.

Loading

0 comments on commit 4fdbc27

Please sign in to comment.