LLaVA-NeXT-Video: fix generation with cache #32527

Closed
wants to merge 2 commits
@@ -853,7 +853,7 @@ def forward(
inputs_embeds = self.get_input_embeddings()(input_ids)

# Merge text and images in prefill stage
if past_key_values is None:
if past_key_values is None or past_key_values.get_seq_length() == 0:
# First merge image tokens if there are any
if pixel_values is not None and pixel_values.size(0) > 0:
image_features = self._get_image_features(pixel_values, image_sizes)
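
For context, not part of the PR diff: with the Cache API, generate can pass the model an empty Cache object on the first forward pass instead of None, so the bare "past_key_values is None" check (the original line above) no longer detects the prefill stage; the replacement line also treats an empty cache as prefill. A minimal sketch of the behavior the new check relies on, assuming the stock DynamicCache from transformers.cache_utils:

from transformers.cache_utils import DynamicCache

# An empty cache object can already exist before any tokens have been processed.
past_key_values = DynamicCache()
print(past_key_values.get_seq_length())  # 0 -> nothing cached yet, still prefill

# The updated condition covers both "no cache" and "empty cache", so the image
# features are still merged into the text embeddings during the prefill pass.
is_prefill = past_key_values is None or past_key_values.get_seq_length() == 0
print(is_prefill)  # True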

tests/models/blip/test_modeling_blip.py (12 additions, 0 deletions)
@@ -1106,6 +1106,7 @@ def test_model_from_pretrained(self):
@require_torch
class BlipTextImageModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (BlipForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (BlipForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_head_masking = False
test_pruning = False
@@ -1116,6 +1117,17 @@ class BlipTextImageModelTest(ModelTesterMixin, unittest.TestCase):
def setUp(self):
self.model_tester = BlipTextImageModelsModelTester(self)

def test_greedy_generation(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()

out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20)
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 19)

def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)

tests/models/blip_2/test_modeling_blip_2.py (14 additions, 2 deletions)
@@ -314,7 +314,7 @@ def __init__(
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=20,
max_position_embeddings=256,
eos_token_id=2,
pad_token_id=1,
bos_token_id=0,
@@ -436,8 +436,9 @@ def prepare_config_and_inputs_for_common(self):


@require_torch
class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_head_masking = False
test_pruning = False
@@ -448,6 +449,17 @@ class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationT
def setUp(self):
self.model_tester = Blip2ForConditionalGenerationDecoderOnlyModelTester(self)

def test_greedy_generation(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()

out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20)
self.assertTrue(out.shape[1] == 21) # BLIP is special, so should be 21

def test_for_conditional_generation(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs)

tests/models/instructblip/test_modeling_instructblip.py (15 additions, 3 deletions)
@@ -38,7 +38,6 @@
)
from transformers.utils import is_torch_available, is_vision_available

from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
@@ -319,7 +318,7 @@ def __init__(
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=20,
max_position_embeddings=256,
eos_token_id=2,
pad_token_id=1,
bos_token_id=0,
@@ -452,8 +451,9 @@ def prepare_config_and_inputs_for_common(self):


@require_torch
class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_head_masking = False
test_pruning = False
@@ -464,6 +464,18 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene
def setUp(self):
self.model_tester = InstructBlipForConditionalGenerationDecoderOnlyModelTester(self)

def test_greedy_generation(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
model.config.text_config.architectures = ["OptForCausalLM"]

out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20)
self.assertTrue(out.shape[1] == 21) # BLIP is special, therefore 21

def test_for_conditional_generation(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs)

tests/models/instructblipvideo/test_modeling_instructblipvideo.py (15 additions, 5 deletions)
@@ -38,7 +38,6 @@
)
from transformers.utils import is_torch_available, is_vision_available

from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
@@ -333,7 +332,7 @@ def __init__(
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=100,
max_position_embeddings=256,
eos_token_id=2,
pad_token_id=1,
bos_token_id=0,
@@ -471,10 +470,9 @@ def prepare_config_and_inputs_for_common(self):


@require_torch
class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
ModelTesterMixin, GenerationTesterMixin, unittest.TestCase
):
class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_head_masking = False
test_pruning = False
@@ -485,6 +483,18 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
def setUp(self):
self.model_tester = InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester(self)

def test_greedy_generation(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
model.config.text_config.architectures = ["OptForCausalLM"]

out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20)
self.assertTrue(out.shape[1] == 21) # BLIP is special, therefore 21

def test_for_conditional_generation(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs)

tests/models/kosmos2/test_modeling_kosmos2.py (11 additions, 0 deletions)
@@ -281,6 +281,17 @@ def setUp(self):
self.model_tester = Kosmos2ModelTester(self)
self.config_tester = ConfigTester(self, config_class=Kosmos2Config, hidden_size=37)

def test_greedy_generation(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()

out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20)
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)

# overwrite from common to skip `image_to_text_projection.latent_query`
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

tests/models/llava/test_modeling_llava.py (12 additions, 0 deletions)
@@ -178,6 +178,7 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase
"""

all_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"image-to-text": LlavaForConditionalGeneration} if is_torch_available() else {}
test_pruning = False
test_head_masking = False
@@ -186,6 +187,17 @@ def setUp(self):
self.model_tester = LlavaVisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=LlavaConfig, has_text_modality=False)

def test_greedy_generation(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()

out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20)
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)

@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)

tests/models/llava_next/test_modeling_llava_next.py (13 additions, 2 deletions)
@@ -34,7 +34,6 @@
torch_device,
)

from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
@@ -208,12 +207,13 @@ def create_and_check_llava_next_model_fp16_autocast_forward(


@require_torch
class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase):
"""
Model tester for `LlavaNextForConditionalGeneration`.
"""

all_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
test_pruning = False
test_head_masking = False

@@ -237,6 +237,17 @@ def test_initialization(self):
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)

def test_greedy_generation(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()

out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20)
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)

@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)

tests/models/llava_next_video/test_modeling_llava_next_video.py (13 additions, 2 deletions)
@@ -34,7 +34,6 @@
torch_device,
)

from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
@@ -223,12 +222,13 @@ def create_and_check_llava_next_video_model_fp16_autocast_forward(


@require_torch
class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase):
"""
Model tester for `LlavaNextVideoForConditionalGeneration`.
"""

all_model_classes = (LlavaNextVideoForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (LlavaNextVideoForConditionalGeneration,) if is_torch_available() else ()
test_pruning = False
test_head_masking = False

@@ -274,6 +274,17 @@ def test_inputs_embeds(self):
with torch.no_grad():
model(**inputs)

def test_greedy_generation(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()

out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20)
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)

@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)

tests/models/paligemma/test_modeling_paligemma.py (12 additions, 0 deletions)
@@ -176,6 +176,7 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, unittest.Test
"""

all_model_classes = (PaliGemmaForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (PaliGemmaForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_pruning = False
test_torchscript = False
@@ -185,6 +186,17 @@ def setUp(self):
self.model_tester = PaliGemmaVisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=PaliGemmaConfig, has_text_modality=False)

def test_greedy_generation(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()

out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20)
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)

@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)

tests/models/video_llava/test_modeling_video_llava.py (13 additions, 2 deletions)
@@ -30,7 +30,6 @@
)
from transformers.testing_utils import require_bitsandbytes, require_torch, require_torch_gpu, slow, torch_device

from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor

@@ -190,12 +189,13 @@ def prepare_config_and_inputs_for_batched_test(self):


@require_torch
class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase):
"""
Model tester for `VideoLlavaForConditionalGeneration`.
"""

all_model_classes = (VideoLlavaForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (VideoLlavaForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_pruning = False
test_resize_embeddings = True
@@ -205,6 +205,17 @@ def setUp(self):
self.model_tester = VideoLlavaVisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=VideoLlavaConfig, has_text_modality=False)

def test_greedy_generation(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()

out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20)
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)

@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)

tests/models/vipllava/test_modeling_vipllava.py (12 additions, 0 deletions)
@@ -158,6 +158,7 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestC
"""

all_model_classes = (VipLlavaForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (VipLlavaForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_pruning = False
test_resize_embeddings = True
@@ -167,6 +168,17 @@ def setUp(self):
self.model_tester = VipLlavaVisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=VipLlavaConfig, has_text_modality=False)

def test_greedy_generation(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()

out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20)
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)

@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
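
A hedged sketch, not part of the PR, of running the new greedy-generation tests locally; it assumes pytest is installed and the command is run from the transformers repository root, and the file path and -k selector are inferred from the file and method names shown in this diff:

import pytest

# Run only the new test_greedy_generation override for one of the touched suites;
# any of the other test files listed above can be substituted for the path below.
pytest.main([
    "tests/models/llava_next_video/test_modeling_llava_next_video.py",
    "-k", "test_greedy_generation",
    "-q",
])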