Preview: Adding Preliminary Video Inference Feature for Old and New Models #15

Open · wants to merge 19 commits into base: main
8 changes: 8 additions & 0 deletions .gitignore
@@ -185,3 +185,11 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Wheels
*.whl

# Videos
# *.mp4
# *.mov
# *.avi
68 changes: 67 additions & 1 deletion README.md
@@ -12,7 +12,7 @@ Baiqi Li*, [Zhiqiu Lin*](https://linzhiqiu.github.io/), [Deepak Pathak](https://

## News

- [2024/08/13] 🔥 **VQAScore** is highlighted in Google's [Imagen3 report](https://arxiv.org/abs/2408.07009) as the strongest replacement of CLIPScore for automated evaluation! **GenAI-Bench** was chosen as one of the key benchmark to showcase Imagen3's superior prompt-image alignment. Kudos to Google for this achievement! [[Paper](https://arxiv.org/abs/2408.07009)]
- [2024/08/13] 🔥 **VQAScore** is highlighted in Google's [Imagen3 report](https://arxiv.org/abs/2408.07009) as the strongest replacement of CLIPScore for automated evaluation! **GenAI-Bench** was chosen as one of the key benchmarks to showcase Imagen3's superior prompt-image alignment. Kudos to Google for this achievement! [[Paper](https://arxiv.org/abs/2408.07009)]
- [2024/07/01] 🔥 **VQAScore** has been accepted to ECCV 2024!
- [2024/06/20] 🔥 **GenAI-Bench** won Best Short Paper at the CVPR'24 SynData Workshop! [[Workshop Site](https://syndata4cv.github.io/)].

@@ -72,6 +72,7 @@ scores = clip_flant5_score(images=images, texts=texts) # scores[i][j] is the sco
- [Using GPT-4o for VQAScore](#using-gpt-4o-for-vqascore)
- [Implementing your own scoring metric](#implementing-your-own-scoring-metric)
- [Text generation (VQA) using CLIP-FlanT5](#text-generation-vqa-using-clip-flant5)
- [Video-text alignment scores](#video-text-alignment-scores)

### Batch processing for more image-text pairs
With a large batch of M images x N texts, you can speed up using the ``batch_forward()`` function.
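A minimal sketch of a `batch_forward()` call is shown below. The `dataset` format (a list of dicts with `images` and `texts` keys) and the `batch_size` argument follow the usage this README describes, but treat the exact field names and the shape of the returned tensor as assumptions if your version differs.

```python
import t2v_metrics

clip_flant5_score = t2v_metrics.VQAScore(model='clip-flant5-xxl')

# Each entry pairs a few images with a few texts; the 'images'/'texts' keys and
# the batch_size argument are assumptions based on the batch_forward() usage above.
dataset = [
    {'images': ["images/0.png", "images/1.png"],
     'texts': ["someone talks on the phone angrily while another person sits happily",
               "someone talks on the phone happily while another person sits angrily"]},
    # ... more {'images': [...], 'texts': [...]} entries
]
scores = clip_flant5_score.batch_forward(dataset=dataset, batch_size=16)  # scores[k][i][j]: sample k, image i, text j
```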
@@ -184,6 +185,71 @@ images = ["images/0.png", "images/0.png"] # A list of images
prompts = ["Please describe this image: ", "Does the image show 'someone talks on the phone angrily while another person sits happily'?"] # Corresponding prompts
clip_flant5_score.model.generate(images=images, prompts=prompts)
```
Note that this feature is only supported with version 4.36.1 of the `transformers` package (i.e., `pip install transformers==4.36.1`).

### Video-Text Alignment Scores

We now support video-text alignment scores, including video-CLIPScore with InternVideo2 and video-VQAScore with LLaVA-OneVision, mPLUG-Owl3, and CLIP-FlanT5. To get started, please install `flash-attn`:

```
pip install flash-attn --no-build-isolation
```

For single-image and CLIP-like models, sampled video frames are concatenated into a single image. For native interleaved-image/video models (we recommend LLaVA-OneVision at the time of writing), the frames are passed directly to the model. A rough sketch of the concatenation step follows the example below.

```python
import t2v_metrics

### For a single (video, text) pair on an image-only VQA model:
clip_flant5_score = t2v_metrics.VQAScore(model='clip-flant5-xxl')
video = "videos/baby.mp4" # a video path in string format
text = "a baby crying"
score = clip_flant5_score(videos=[video], texts=[text], concatenate='horizontal', num_frames=4) # For native interleaved-image/video LMMs (like LLaVA-OneVision), the 'concatenate' argument is unnecessary.

### For a single (video, text) pair on an interleaved-image/video VQA model:
llava_ov_score = t2v_metrics.VQAScore(model='llava-onevision-qwen2-7b-ov')
video = "videos/baby.mp4" # a video path in string format
text = "a baby crying"
score = llava_ov_score(videos=[video], texts=[text], num_frames=4)

### Alternatively, to calculate the pairwise similarity scores
### between M videos and N texts, run the following to return an M x N score tensor.
videos = ["videos/baby.mp4", "videos/ducks.mp4"]
texts = ["a baby crying",
         "a group of ducks standing in the water"]
scores = llava_ov_score(videos=videos, texts=texts, num_frames=4) # scores[i][j] is the score between video i and text j
```
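For intuition, here is a rough sketch of what the frame-concatenation step for image-only models could look like. This is illustrative only: the real preprocessing lives inside `t2v_metrics`, and the uniform frame sampling, resizing, and horizontal layout below are assumptions rather than the package's exact behavior.

```python
# Illustrative sketch of horizontal frame concatenation (not the actual t2v_metrics code).
import decord
import numpy as np
from PIL import Image

def concat_frames_horizontally(video_path: str, num_frames: int = 4) -> Image.Image:
    vr = decord.VideoReader(video_path)
    idxs = np.linspace(0, len(vr) - 1, num_frames).astype(int)   # uniformly sampled frame indices
    frames = [Image.fromarray(vr[int(i)].asnumpy()) for i in idxs]
    height = min(f.height for f in frames)
    frames = [f.resize((int(f.width * height / f.height), height)) for f in frames]  # match heights
    canvas = Image.new("RGB", (sum(f.width for f in frames), height))
    x_offset = 0
    for f in frames:
        canvas.paste(f, (x_offset, 0))
        x_offset += f.width
    return canvas

# concat_frames_horizontally("videos/baby.mp4").save("baby_frames.png")  # quick visual check
```

Native interleaved-image/video models skip this step and receive the sampled frames directly.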

### Natural Language Text Generation

We also support natural language text generation for image/video inputs on any VQAScore model. Here is a representative example:
```python
import t2v_metrics

### For a single (video, text) pair on an interleaved-image/video VQA model:
llava_ov_score = t2v_metrics.VQAScore(model='llava-onevision-qwen2-7b-ov')
video = "videos/baby.mp4" # a video path in string format
text = "What is the baby doing in this video?"
generated_text = llava_ov_score(videos=[video], texts=[text], num_frames=4, max_new_tokens=512, generate=True)

### Alternatively, to generate a list of outputs from a batch of paired vision inputs and prompts:
videos = ["videos/baby.mp4", "videos/ducks.mp4"]
texts = ["What is the baby doing in this video?",
         "How many ducks are there?"]
generated_texts = llava_ov_score(videos=videos, texts=texts, num_frames=4, max_new_tokens=512, generate=True)
```
## Contributions

- **[Zhiqiu Lin](https://x.com/ZhiqiuLin)**, **[Jean de Nyandwi](https://x.com/Jeande_d)**, **[Chancharik Mitra](https://x.com/chancharikm)**
Implemented image-based **CLIPScore** and **VQAScore** for:
CLIP-FlanT5, GPT-4o, LLaVA-1.5, InstructBLIP, OpenCLIP, HPSv2, ImageReward, PickScore.

- **Baiqi Li**
Implemented **GenAI-Bench** and **GenAI-Rank** benchmarks.

- **[Chancharik Mitra](https://x.com/chancharikm)**
Implemented video-based **VQAScore** for:
LLaVA-OneVision, InternVideo2, mPLUG-Owl3, PaliGemma, InternVL2, InternLM-XComposer2.5.

## Citation

Binary file added SimSun.ttf
1 change: 0 additions & 1 deletion dataset.py
@@ -826,7 +826,6 @@ def __init__(self,

# Append the PIL Image path to the list
current_frames.append(output_path)

if len(current_frames) < num_frames:
current_frames = current_frames + [current_frames[-1]] * (num_frames - len(current_frames))
elif len(current_frames) > num_frames:
2 changes: 0 additions & 2 deletions eval.py
@@ -41,8 +41,6 @@ def main():
TIFA160_DSG,
Pickapic_v1,
SeeTrue,
StanfordT23D,
T2VScore,
Flickr8K_CF,
]:

35 changes: 34 additions & 1 deletion pyproject.toml
@@ -29,7 +29,7 @@ dependencies = [
"pandas>=2.1.4",
"scipy>=1.11.4",
"sentencepiece>=0.1.99",
"transformers>=4.36.1",
"transformers",
"datasets>=2.15.0",
"tokenizers",
"omegaconf",
@@ -40,8 +40,41 @@ dependencies = [
"pycocoevalcap",
"image-reward",
"hpsv2",
# Video Model Additional Requirements
"llava @ git+https://github.com/LLaVA-VL/LLaVA-NeXT.git",
"fire==0.4.0",
"tiktoken>=0.7.0",
"peft==0.5.0",
"decord",
"easydict",
"av",
"icecream",
"markdown2",
"pydantic",
"accelerate",
"shortuuid",
"bitsandbytes",
"timm",
"requests",
"httpx==0.24.0",
"uvicorn",
"einops-exts",
"einops",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl",
"PyYAML",
"wandb",
"moviepy",
"google-generativeai>=0.8.2",
"apex==0.9.10dev",
"deepspeed==0.10.1",
"fvcore==0.1.5.post20221221",
"imageio==2.31.1",
"librosa==0.10.1",
"Pillow==10.0.0",
"psutil==5.9.5",
"soundfile==0.12.1",
"termcolor==2.4.0",
"qwen-vl-utils"
]

[tool.setuptools]
4 changes: 4 additions & 0 deletions t2v_metrics/__init__.py
@@ -2,6 +2,10 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import os

# Make the bundled InternVideo2 'multi_modality' package importable when t2v_metrics is imported
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models/clipscore_models/InternVideo2/multi_modality/'))

from .constants import HF_CACHE_DIR
from .vqascore import VQAScore, list_all_vqascore_models
7 changes: 5 additions & 2 deletions t2v_metrics/clipscore.py
@@ -10,11 +10,14 @@ class CLIPScore(Score):
def prepare_scoremodel(self,
model='openai:ViT-L/14',
device='cuda',
cache_dir=HF_CACHE_DIR):
cache_dir=HF_CACHE_DIR,
**kwargs):
print(f'{kwargs.get("model_path", None)}')  # use .get() so models without a model_path kwarg do not raise a KeyError
return get_clipscore_model(
model,
device=device,
cache_dir=cache_dir
cache_dir=cache_dir,
**kwargs
)

def list_all_models(self) -> List[str]:
14 changes: 9 additions & 5 deletions t2v_metrics/models/clipscore_models/__init__.py
@@ -2,27 +2,31 @@
from .blip2_itc_model import BLIP2_ITC_MODELS, BLIP2ITCScoreModel
from .hpsv2_model import HPSV2_MODELS, HPSV2ScoreModel
from .pickscore_model import PICKSCORE_MODELS, PickScoreModel
from .internvideo2clip_model import INTERNVIDEO2_MODELS, InternVideo2Model
from ...constants import HF_CACHE_DIR

ALL_CLIP_MODELS = [
CLIP_MODELS,
BLIP2_ITC_MODELS,
HPSV2_MODELS,
PICKSCORE_MODELS,
INTERNVIDEO2_MODELS
]

def list_all_clipscore_models():
return [model for models in ALL_CLIP_MODELS for model in models]

def get_clipscore_model(model_name, device='cuda', cache_dir=HF_CACHE_DIR):
def get_clipscore_model(model_name, device='cuda', cache_dir=HF_CACHE_DIR, **kwargs):
assert model_name in list_all_clipscore_models()
if model_name in CLIP_MODELS:
return CLIPScoreModel(model_name, device=device, cache_dir=cache_dir)
return CLIPScoreModel(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in BLIP2_ITC_MODELS:
return BLIP2ITCScoreModel(model_name, device=device, cache_dir=cache_dir)
return BLIP2ITCScoreModel(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in HPSV2_MODELS:
return HPSV2ScoreModel(model_name, device=device, cache_dir=cache_dir)
return HPSV2ScoreModel(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in PICKSCORE_MODELS:
return PickScoreModel(model_name, device=device, cache_dir=cache_dir)
return PickScoreModel(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in INTERNVIDEO2_MODELS:
return InternVideo2Model(model_name, device=device, cache_dir=cache_dir, **kwargs)
else:
raise NotImplementedError()
3 changes: 3 additions & 0 deletions t2v_metrics/models/clipscore_models/blip2_itc_model.py
@@ -15,11 +15,14 @@

class BLIP2ITCScoreModel(ScoreModel):
"A wrapper for BLIP-2 ITCScore models"
video_mode = "concat"
allows_image = True
def __init__(self,
model_name='blip2-itc',
device='cuda',
cache_dir=HF_CACHE_DIR):
assert model_name in BLIP2_ITC_MODELS, f"Model name must be one of {BLIP2_ITC_MODELS.keys()}"

os.environ['TORCH_HOME'] = cache_dir
import timm.models.hub as timm_hub
super().__init__(model_name=model_name,
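The `video_mode = "concat"` and `allows_image = True` class attributes added to these wrappers presumably tell the scoring front end how to feed video inputs to each model: stitch sampled frames into one image for image-only models, or pass frames through for native video models. The sketch below shows one way such flags might be consumed; it is an assumed reading of the attributes, not the repository's actual dispatch code.

```python
# Illustrative only: assumed semantics of video_mode / allows_image.
import decord
import numpy as np
from PIL import Image

def prepare_visual_input(model, visual, num_frames=4):
    is_video = isinstance(visual, str) and visual.lower().endswith((".mp4", ".mov", ".avi"))
    if not is_video:
        # Still images are only valid for wrappers that declare allows_image.
        assert getattr(model, "allows_image", False), "wrapper does not accept still images"
        return visual
    vr = decord.VideoReader(visual)
    idxs = np.linspace(0, len(vr) - 1, num_frames).astype(int)   # uniform frame sampling
    frames = [vr[int(i)].asnumpy() for i in idxs]                # H x W x 3 arrays
    if getattr(model, "video_mode", None) == "concat":
        # Image-only wrappers: stitch the sampled frames into one wide image.
        return Image.fromarray(np.hstack(frames))
    return [Image.fromarray(f) for f in frames]                  # native video models get the frame list
```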
3 changes: 3 additions & 0 deletions t2v_metrics/models/clipscore_models/clip_model.py
@@ -10,11 +10,14 @@

class CLIPScoreModel(ScoreModel):
"A wrapper for OpenCLIP models (including openAI's CLIP, OpenCLIP, DatacompCLIP)"
video_mode = "concat"
allows_image = True
def __init__(self,
model_name='openai:ViT-L-14',
device='cuda',
cache_dir=HF_CACHE_DIR):
assert model_name in CLIP_MODELS

super().__init__(model_name=model_name,
device=device,
cache_dir=cache_dir)
2 changes: 2 additions & 0 deletions t2v_metrics/models/clipscore_models/hpsv2_model.py
@@ -8,6 +8,8 @@

class HPSV2ScoreModel(ScoreModel):
"A wrapper for HPSv2 models "
video_mode = "concat"
allows_image = True
def __init__(self,
model_name='openai:ViT-L-14',
device='cuda',
3 changes: 3 additions & 0 deletions t2v_metrics/models/clipscore_models/pickscore_model.py
@@ -11,11 +11,14 @@

class PickScoreModel(ScoreModel):
"A wrapper for PickScore models"
video_mode = "concat"
allows_image = True
def __init__(self,
model_name='pickscore-v1',
device='cuda',
cache_dir=HF_CACHE_DIR):
assert model_name in PICKSCORE_MODELS

super().__init__(model_name=model_name,
device=device,
cache_dir=cache_dir)
2 changes: 2 additions & 0 deletions t2v_metrics/models/itmscore_models/blip2_itm_model.py
@@ -15,6 +15,8 @@

class BLIP2ITMScoreModel(ScoreModel):
"A wrapper for BLIP-2 ITMScore models"
video_mode = "concat"
allows_image = True
def __init__(self,
model_name='blip2-itm',
device='cuda',
2 changes: 2 additions & 0 deletions t2v_metrics/models/itmscore_models/image_reward_model.py
@@ -13,6 +13,8 @@

class ImageRewardScoreModel(ScoreModel):
"A wrapper for ImageReward ITMScore (finetuned on human preference) models"
video_mode = "concat"
allows_image = True
def __init__(self,
model_name='image-reward-v1',
device='cuda',
42 changes: 42 additions & 0 deletions t2v_metrics/models/vqascore_models/__init__.py
@@ -3,6 +3,17 @@
from .llava16_model import LLAVA16_MODELS, LLaVA16Model
from .instructblip_model import InstructBLIP_MODELS, InstructBLIPModel
from .gpt4v_model import GPT4V_MODELS, GPT4VModel
from .llavaov_model import LLAVA_OV_MODELS, LLaVAOneVisionModel
from .mplug_model import MPLUG_OWL3_MODELS, mPLUGOwl3Model
from .paligemma_model import PALIGEMMA_MODELS, PaliGemmaModel
from .internvl_model import INTERNVL2_MODELS, InternVL2Model
from .internvideo_model import INTERNVIDEO2_MODELS, InternVideo2Model
from .internlm_model import INTERNLMXCOMPOSER25_MODELS, InternLMXComposer25Model
from .llama32_model import LLAMA_32_VISION_MODELS, LLaMA32VisionModel
from .molmo_model import MOLMO_MODELS, MOLMOVisionModel
from .gemini_model import GEMINI_MODELS, GeminiModel
from .qwen2vl_model import QWEN2_VL_MODELS, Qwen2VLModel

from ...constants import HF_CACHE_DIR

ALL_VQA_MODELS = [
@@ -11,8 +22,19 @@
LLAVA16_MODELS,
InstructBLIP_MODELS,
GPT4V_MODELS,
LLAVA_OV_MODELS,
MPLUG_OWL3_MODELS,
PALIGEMMA_MODELS,
INTERNVL2_MODELS,
INTERNVIDEO2_MODELS,
INTERNLMXCOMPOSER25_MODELS,
LLAMA_32_VISION_MODELS,
MOLMO_MODELS,
GEMINI_MODELS,
QWEN2_VL_MODELS
]


def list_all_vqascore_models():
return [model for models in ALL_VQA_MODELS for model in models]

Expand All @@ -28,5 +50,25 @@ def get_vqascore_model(model_name, device='cuda', cache_dir=HF_CACHE_DIR, **kwar
return InstructBLIPModel(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in GPT4V_MODELS:
return GPT4VModel(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in LLAVA_OV_MODELS:
return LLaVAOneVisionModel(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in MPLUG_OWL3_MODELS:
return mPLUGOwl3Model(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in PALIGEMMA_MODELS:
return PaliGemmaModel(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in INTERNVL2_MODELS:
return InternVL2Model(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in INTERNVIDEO2_MODELS:
return InternVideo2Model(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in INTERNLMXCOMPOSER25_MODELS:
return InternLMXComposer25Model(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in LLAMA_32_VISION_MODELS:
return LLaMA32VisionModel(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in MOLMO_MODELS:
return MOLMOVisionModel(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in GEMINI_MODELS:
return GeminiModel(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in QWEN2_VL_MODELS:
return Qwen2VLModel(model_name, device=device, cache_dir=cache_dir, **kwargs)
else:
raise NotImplementedError()
2 changes: 2 additions & 0 deletions t2v_metrics/models/vqascore_models/clip_t5_model.py
@@ -158,6 +158,8 @@ def format_answer(answer, conversation_style='plain'):

class CLIPT5Model(VQAScoreModel):
"""A wrapper for the CLIP-FlanT5 or CLIP-T5 models"""
video_mode = "concat"
allows_image = True
def __init__(self,
model_name='clip-flant5-xxl',
device='cuda',