Fix position embeddings singular/plural (huggingface#33678)
* fix position embeddings

* [run-slow] blip, blip_2, instructblip, instructblipvideo

* fix init

* [run-slow] blip, blip_2, instructblip, instructblipvideo

* fix copies

* [run-slow] blip, blip_2, instructblip, instructblipvideo

* [run-slow] blip, blip_2, instructblip, instructblipvideo

* handle exception where list + tensors are cat'd

* [run-slow] blip, blip_2, instructblip, instructblipvideo

* add missing default

* [run-slow] blip, blip_2, instructblip, instructblipvideo
molbap authored and BernardZach committed Dec 6, 2024
1 parent d143710 commit 3f2c059
Showing 5 changed files with 21 additions and 25 deletions.
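The crux of the change in the four vision-embedding modules is an attribute-name mismatch: each class registers its parameter as self.position_embedding (singular), while the interpolate_pos_encoding method copied over from ViT referenced self.position_embeddings (plural), which fails with an AttributeError at runtime. Below is a minimal sketch of the failure mode and the fix, using a toy module rather than the actual Transformers classes:

```python
import torch
import torch.nn as nn


class ToyVisionEmbeddings(nn.Module):
    """Toy module mirroring the bug: the parameter is registered as
    `position_embedding` (singular), but a method copied from ViT
    referred to `position_embeddings` (plural)."""

    def __init__(self, num_positions: int = 5, embed_dim: int = 4):
        super().__init__()
        self.position_embedding = nn.Parameter(torch.randn(1, num_positions, embed_dim))

    def buggy_lookup(self) -> torch.Tensor:
        # Mirrors the pre-fix code path: wrong attribute name.
        return self.position_embeddings  # raises AttributeError

    def fixed_lookup(self) -> torch.Tensor:
        # Mirrors the post-fix code path: matches the registered parameter.
        return self.position_embedding


module = ToyVisionEmbeddings()
print(module.fixed_lookup().shape)  # torch.Size([1, 5, 4])
try:
    module.buggy_lookup()
except AttributeError as err:
    print(f"pre-fix behaviour: {err}")
```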
9 changes: 4 additions & 5 deletions src/transformers/models/blip/modeling_blip.py
@@ -233,7 +233,6 @@ def __init__(self, config: BlipVisionConfig):
 
         self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
 
-    # Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding
     def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
         """
         This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
@@ -245,14 +244,14 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width:
         """
 
         num_patches = embeddings.shape[1] - 1
-        num_positions = self.position_embeddings.shape[1] - 1
+        num_positions = self.position_embedding.shape[1] - 1
 
         # always interpolate when tracing to ensure the exported model works for dynamic input shapes
         if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
-            return self.position_embeddings
+            return self.position_embedding
 
-        class_pos_embed = self.position_embeddings[:, :1]
-        patch_pos_embed = self.position_embeddings[:, 1:]
+        class_pos_embed = self.position_embedding[:, :1]
+        patch_pos_embed = self.position_embedding[:, 1:]
 
         dim = embeddings.shape[-1]
 
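For context, interpolate_pos_encoding (shown in the diff above) lets the model run on images larger than the pretraining resolution: the class-token position embedding is kept as-is, and the patch position embeddings are treated as a 2D grid and resized by interpolation. The following is a minimal standalone sketch of that idea, assuming a square patch grid and bicubic resizing; the helper name and shapes are illustrative, not the exact Transformers implementation:

```python
import math

import torch
import torch.nn.functional as F


def interpolate_patch_positions(position_embedding: torch.Tensor, new_num_patches: int) -> torch.Tensor:
    """Resize a (1, 1 + N, dim) position table to (1, 1 + new_num_patches, dim).

    Index 0 is the class token and is kept unchanged; the remaining N entries
    are viewed as a sqrt(N) x sqrt(N) grid and resized spatially, which is the
    general idea behind interpolate_pos_encoding.
    """
    class_pos = position_embedding[:, :1]   # (1, 1, dim), left untouched
    patch_pos = position_embedding[:, 1:]   # (1, N, dim)
    dim = position_embedding.shape[-1]

    old_size = int(math.sqrt(patch_pos.shape[1]))
    new_size = int(math.sqrt(new_num_patches))

    # (1, N, dim) -> (1, dim, old_size, old_size) so F.interpolate can resize spatially
    patch_pos = patch_pos.reshape(1, old_size, old_size, dim).permute(0, 3, 1, 2)
    patch_pos = F.interpolate(patch_pos, size=(new_size, new_size), mode="bicubic", align_corners=False)
    patch_pos = patch_pos.permute(0, 2, 3, 1).reshape(1, new_size * new_size, dim)

    return torch.cat([class_pos, patch_pos], dim=1)


table = torch.randn(1, 1 + 16 * 16, 768)        # pretrained at a 16x16 patch grid
resized = interpolate_patch_positions(table, 24 * 24)
print(resized.shape)                            # torch.Size([1, 577, 768])
```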
9 changes: 4 additions & 5 deletions src/transformers/models/blip_2/modeling_blip_2.py
@@ -200,7 +200,6 @@ def __init__(self, config: Blip2VisionConfig):
 
         self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
 
-    # Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding
     def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
         """
         This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
@@ -212,14 +211,14 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width:
         """
 
         num_patches = embeddings.shape[1] - 1
-        num_positions = self.position_embeddings.shape[1] - 1
+        num_positions = self.position_embedding.shape[1] - 1
 
         # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
-            return self.position_embeddings
+            return self.position_embedding
 
-        class_pos_embed = self.position_embeddings[:, :1]
-        patch_pos_embed = self.position_embeddings[:, 1:]
+        class_pos_embed = self.position_embedding[:, :1]
+        patch_pos_embed = self.position_embedding[:, 1:]
 
         dim = embeddings.shape[-1]
 
9 changes: 4 additions & 5 deletions src/transformers/models/instructblip/modeling_instructblip.py
@@ -104,7 +104,6 @@ def __init__(self, config: InstructBlipVisionConfig):
 
         self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
 
-    # Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding
     def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
         """
         This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
@@ -116,14 +115,14 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width:
         """
 
         num_patches = embeddings.shape[1] - 1
-        num_positions = self.position_embeddings.shape[1] - 1
+        num_positions = self.position_embedding.shape[1] - 1
 
         # always interpolate when tracing to ensure the exported model works for dynamic input shapes
         if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
-            return self.position_embeddings
+            return self.position_embedding
 
-        class_pos_embed = self.position_embeddings[:, :1]
-        patch_pos_embed = self.position_embeddings[:, 1:]
+        class_pos_embed = self.position_embedding[:, :1]
+        patch_pos_embed = self.position_embedding[:, 1:]
 
         dim = embeddings.shape[-1]
 
10 changes: 5 additions & 5 deletions src/transformers/models/instructblip/processing_instructblip.py
@@ -122,8 +122,10 @@ def __call__(
             elif not isinstance(text, list) and not isinstance(text[0], str):
                 raise ValueError("Invalid input text. Please provide a string, or a list of strings")
 
-            _text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])
-
+            # we have to concatenate lists - so we keep track of return_tensors here
+            return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+            _text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"], return_tensors=None)
+            output_kwargs["text_kwargs"]["return_tensors"] = return_tensors
             # if we know how many query tokens, expand text inside processor. We need this hacky manipulation
             # because BLIP expects image tokens to be at the beginning even before BOS token
             if self.num_query_tokens is not None and images is not None:
@@ -145,9 +147,7 @@
                     )
 
             # cast to desired return tensors type after concatenating
-            text_encoding = BatchEncoding(
-                text_encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")
-            )
+            text_encoding = BatchEncoding(text_encoding, tensor_type=return_tensors)
 
             encoding.update(text_encoding)
             qformer_text_encoding = self.qformer_tokenizer(text, **output_kwargs["text_kwargs"])
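The processor change addresses the "list + tensors are cat'd" case from the commit message: the tokenizer is now called with return_tensors=None so its outputs stay plain Python lists that can be concatenated with the expanded image/query-token ids, and the requested tensor type is only applied at the very end via BatchEncoding(..., tensor_type=return_tensors). A minimal sketch of that pattern with made-up token ids follows; the id values and the 4-token placeholder are hypothetical, not the real query tokens:

```python
from transformers import BatchEncoding

# Hypothetical placeholder ids standing in for the expanded image/query tokens,
# and text ids as the tokenizer returns them when return_tensors=None (plain lists).
image_token_ids = [[101, 101, 101, 101], [101, 101, 101, 101]]
text_ids = [[7592, 2088, 102], [2129, 2024, 102]]

# Plain lists concatenate cheaply; pre-built tensors would not.
input_ids = [img + txt for img, txt in zip(image_token_ids, text_ids)]
attention_mask = [[1] * len(row) for row in input_ids]

# Only after concatenating is the requested tensor type applied, mirroring
# BatchEncoding(text_encoding, tensor_type=return_tensors) in the diff above.
encoding = BatchEncoding({"input_ids": input_ids, "attention_mask": attention_mask}, tensor_type="pt")
print(encoding["input_ids"].shape)  # torch.Size([2, 7])
```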
9 changes: 4 additions & 5 deletions src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
@@ -111,7 +111,6 @@ def __init__(self, config: InstructBlipVideoVisionConfig):
 
         self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
 
-    # Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding
     def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
         """
         This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
@@ -123,14 +122,14 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width:
         """
 
         num_patches = embeddings.shape[1] - 1
-        num_positions = self.position_embeddings.shape[1] - 1
+        num_positions = self.position_embedding.shape[1] - 1
 
         # always interpolate when tracing to ensure the exported model works for dynamic input shapes
         if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
-            return self.position_embeddings
+            return self.position_embedding
 
-        class_pos_embed = self.position_embeddings[:, :1]
-        patch_pos_embed = self.position_embeddings[:, 1:]
+        class_pos_embed = self.position_embedding[:, :1]
+        patch_pos_embed = self.position_embedding[:, 1:]
 
         dim = embeddings.shape[-1]
 
