Paligemma support for multi-image (#33447)
* update
* Update src/transformers/models/paligemma/processing_paligemma.py
  Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
* update docs
* better example in tests
* support image tokens
* read token
* Update tests/models/paligemma/test_processing_paligemma.py
  Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com>
* nit: naming
* Update docs/source/en/model_doc/paligemma.md
  Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
* conflicts after rebasing
---------
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com>
1 parent 55b7a04 · commit 3e039d3
Showing 4 changed files with 223 additions and 50 deletions.
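
Before the diff, here is a rough sketch of what the new multi-image behaviour looks like from the caller's side. It is only an illustration distilled from the processor test shown below (tests/models/paligemma/test_processing_paligemma.py, per the commit message); the checkpoint name "google/paligemma-3b-pt-224" and the image file names are assumptions made for the example, not part of the commit.

from PIL import Image

from transformers import PaliGemmaProcessor

# Assumed (gated) checkpoint; any PaliGemma checkpoint with a saved processor should behave the same.
processor = PaliGemmaProcessor.from_pretrained("google/paligemma-3b-pt-224")

# Two hypothetical local images that belong to a single prompt.
image1 = Image.open("cat.png")
image2 = Image.open("dog.png")

# One prompt, two images: nesting the images makes the pairing with the single text explicit.
inputs = processor(
    text="What changed between these two pictures?",
    images=[[image1, image2]],
    return_tensors="pt",
)

# Equivalent call with explicit image placeholders and a flat image list,
# mirroring `text_multi_images` in the test below.
inputs_explicit = processor(
    text="<image><image><bos>What changed between these two pictures?",
    images=[image1, image2],
    return_tensors="pt",
)

# A flat list of two images together with a single plain-text prompt is ambiguous
# (one prompt with two images, or a forgotten second prompt?) and raises a ValueError.

The diff below adds the processor test that pins down this behaviour.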
@@ -0,0 +1,84 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
import tempfile
import unittest

from transformers import AutoProcessor, GemmaTokenizerFast, PaliGemmaProcessor
from transformers.testing_utils import require_read_token, require_vision
from transformers.utils import is_vision_available

from ...test_processing_common import ProcessorTesterMixin


if is_vision_available():
    from transformers import SiglipImageProcessor


@require_vision
@require_read_token
class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    processor_class = PaliGemmaProcessor

    def setUp(self):
        self.tmpdirname = tempfile.mkdtemp()
        image_processor = SiglipImageProcessor(do_center_crop=False)
        tokenizer = GemmaTokenizerFast.from_pretrained("google/gemma-7b")
        image_processor.image_seq_length = 32

        processor = PaliGemmaProcessor(image_processor=image_processor, tokenizer=tokenizer)
        processor.save_pretrained(self.tmpdirname)

    def get_tokenizer(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer

    def get_image_processor(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor

    def tearDown(self):
        shutil.rmtree(self.tmpdirname)

    def test_text_with_image_tokens(self):
        image_processor = self.get_component("image_processor")
        tokenizer = self.get_component("tokenizer")

        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        text_multi_images = "<image><image><bos>Dummy text!"
        text_single_image = "<image><bos>Dummy text!"
        text_no_image = "Dummy text!"

        image = self.prepare_image_inputs()[0]

        # A single image with or without an explicit <image> token in the text should give identical outputs.
        out_noimage = processor(text=text_no_image, images=image, return_tensors="np")
        out_singlimage = processor(text=text_single_image, images=image, return_tensors="np")
        for k in out_noimage:
            self.assertTrue(out_noimage[k].tolist() == out_singlimage[k].tolist())

        # One text with two images: explicit "<image><image>" tokens plus a flat list is equivalent
        # to plain text plus a nested list.
        out_multiimages = processor(text=text_multi_images, images=[image, image], return_tensors="np")
        out_noimage = processor(text=text_no_image, images=[[image, image]], return_tensors="np")

        # We can't be sure what the user's intention is: one text with two images, or a forgotten
        # second text. This ambiguous combination must raise.
        with self.assertRaises(ValueError):
            out_noimage = processor(text=text_no_image, images=[image, image], return_tensors="np")

        for k in out_noimage:
            self.assertTrue(out_noimage[k].tolist() == out_multiimages[k].tolist())

        # Batched prompts: a flat list of images (one per prompt) and a nested list are equivalent.
        text_batched = ["Dummy text!", "Dummy text!"]
        text_batched_with_image = ["<image><bos>Dummy text!", "<image><bos>Dummy text!"]
        out_images = processor(text=text_batched_with_image, images=[image, image], return_tensors="np")
        out_noimage_nested = processor(text=text_batched, images=[[image], [image]], return_tensors="np")
        out_noimage = processor(text=text_batched, images=[image, image], return_tensors="np")
        for k in out_noimage:
            self.assertTrue(out_noimage[k].tolist() == out_images[k].tolist() == out_noimage_nested[k].tolist())
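
Taken together, the assertions encode the calling convention this commit settles on: nested image lists explicitly pair images with texts, flat lists are accepted only when the mapping is unambiguous (explicit <image> tokens, or batched prompts with exactly one image each), and ambiguous combinations raise a ValueError rather than guessing.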