
Preview: Adding Preliminary Video Inference Feature for Old and New Models #15

Open · wants to merge 19 commits into base: main · changes from 16 commits shown
8 changes: 8 additions & 0 deletions .gitignore
@@ -185,3 +185,11 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Wheels
*.whl

# Videos
# *.mp4
# *.mov
# *.avi
50 changes: 49 additions & 1 deletion README.md
@@ -12,7 +12,7 @@ Baiqi Li*, [Zhiqiu Lin*](https://linzhiqiu.github.io/), [Deepak Pathak](https://

## News

- [2024/08/13] 🔥 **VQAScore** is highlighted in Google's [Imagen3 report](https://arxiv.org/abs/2408.07009) as the strongest replacement of CLIPScore for automated evaluation! **GenAI-Bench** was chosen as one of the key benchmark to showcase Imagen3's superior prompt-image alignment. Kudos to Google for this achievement! [[Paper](https://arxiv.org/abs/2408.07009)]
- [2024/08/13] 🔥 **VQAScore** is highlighted in Google's [Imagen3 report](https://arxiv.org/abs/2408.07009) as the strongest replacement of CLIPScore for automated evaluation! **GenAI-Bench** was chosen as one of the key benchmarks to showcase Imagen3's superior prompt-image alignment. Kudos to Google for this achievement! [[Paper](https://arxiv.org/abs/2408.07009)]
- [2024/07/01] 🔥 **VQAScore** has been accepted to ECCV 2024!
- [2024/06/20] 🔥 **GenAI-Bench** won Best Short Paper at the CVPR'24 SynData Workshop! [[Workshop Site](https://syndata4cv.github.io/)].

@@ -72,6 +72,7 @@ scores = clip_flant5_score(images=images, texts=texts) # scores[i][j] is the sco
- [Using GPT-4o for VQAScore](#using-gpt-4o-for-vqascore)
- [Implementing your own scoring metric](#implementing-your-own-scoring-metric)
- [Text generation (VQA) using CLIP-FlanT5](#text-generation-vqa-using-clip-flant5)
- [Video-text alignment scores](#video-text-alignment-scores)

### Batch processing for more image-text pairs
With a large batch of M images x N texts, you can speed up using the ``batch_forward()`` function.
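
For reference, a minimal sketch of batch scoring is shown below. It assumes `batch_forward` accepts a list of dictionaries with `images` and `texts` keys, as in the repository's image-only examples; the file paths and batch size are illustrative, not prescribed by this PR.

```python
import t2v_metrics

clip_flant5_score = t2v_metrics.VQAScore(model='clip-flant5-xxl')

# Each dictionary holds the same number of images and texts; the result is a
# score tensor over all image-text pairs within each sample.
dataset = [
    {'images': ["images/0.png", "images/1.png"],
     'texts': ["someone talks on the phone angrily while another person sits happily",
               "someone talks on the phone happily while another person sits angrily"]},
]
scores = clip_flant5_score.batch_forward(dataset=dataset, batch_size=16)
```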
@@ -184,6 +185,53 @@ images = ["images/0.png", "images/0.png"] # A list of images
prompts = ["Please describe this image: ", "Does the image show 'someone talks on the phone angrily while another person sits happily'?"] # Corresponding prompts
clip_flant5_score.model.generate(images=images, prompts=prompts)
```
Note that this feature is only supported with version 4.36.1 of the `transformers` package (i.e., `pip install transformers==4.36.1`).

### Video-Text Alignment Scores

We now support video-text alignment scores, including video-CLIPScore with InternVideo2 and video-VQAScore with LLaVA-OneVision, mPLUG-Owl3, and CLIP-FlanT5. To get started, please install `flash-attn`:

```
pip install flash-attn --no-build-isolation
```

For single-image and CLIP-like models, video frames are concatenated into a single image. For natively interleaved-image/video models (we recommend LLaVA-OneVision at the time of writing), video frames are passed directly to the model.
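
To make the `concatenate` and `num_frames` arguments concrete, here is a minimal, self-contained sketch of the idea: uniform frame sampling plus horizontal tiling. It assumes `decord` (listed in the added dependencies) and `Pillow` are installed, and it illustrates the concept rather than the library's internal implementation.

```python
from decord import VideoReader  # decord is among the added dependencies
from PIL import Image
import numpy as np

def concat_frames_horizontally(video_path: str, num_frames: int = 4) -> Image.Image:
    """Uniformly sample `num_frames` frames and tile them left-to-right."""
    vr = VideoReader(video_path)
    indices = np.linspace(0, len(vr) - 1, num_frames).astype(int)
    frames = [Image.fromarray(vr[int(i)].asnumpy()) for i in indices]
    width, height = frames[0].size
    strip = Image.new("RGB", (width * num_frames, height))
    for i, frame in enumerate(frames):
        strip.paste(frame, (i * width, 0))
    return strip

# The resulting strip is a single image that an image-only model can score.
strip = concat_frames_horizontally("videos/baby.mp4", num_frames=4)
```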

```python
import t2v_metrics

### For a single (video, text) pair on an image-only VQA model:
clip_flant5_score = t2v_metrics.VQAScore(model='clip-flant5-xxl')
video = "videos/baby.mp4" # a video path in string format
text = "a baby crying"
score = clip_flant5_score(videos=[video], texts=[text], concatenate='horizontal', num_frames=4) # For natively interleaved-image/video LMMs (like LLaVA-OneVision), the 'concatenate' argument is unnecessary.

### For a single (video, text) pair on an interleaved-image/video VQA model:
llava_ov_score = t2v_metrics.VQAScore(model='llava-onevision-qwen2-7b-ov')
video = "videos/baby.mp4" # a video path in string format
text = "a baby crying"
score = llava_ov_score(videos=[video], texts=[text], num_frames=4)

### Alternatively, if you want to calculate the pairwise similarity scores
### between M videos and N texts, run the following to return an M x N score tensor.
videos = ["videos/baby.mp4", "videos/ducks.mp4"]
texts = ["a baby crying",
         "a group of ducks standing in the water"]
scores = llava_ov_score(videos=videos, texts=texts, num_frames=4) # scores[i][j] is the score between video i and text j
```

## Contributions

- **[Zhiqiu Lin](https://x.com/ZhiqiuLin)**, **[Jean de Nyandwi](https://x.com/Jeande_d)**, **[Chancharik Mitra](https://x.com/chancharikm)**
Implemented image-based **CLIPScore** and **VQAScore** for:
CLIP-FlanT5, GPT-4o, LLaVA-1.5, InstructBLIP, OpenCLIP, HPSv2, ImageReward, PickScore.

- **Baiqi Li**
Implemented **GenAI-Bench** and **GenAI-Rank** benchmarks.

- **[Chancharik Mitra](https://x.com/chancharikm)**
Implemented video-based **VQAScore** for:
LLaVA-OneVision, InternVideo2, mPLUG-Owl3, PaliGemma, InternVL2, InternLM-XComposer-2.5.

## Citation

Binary file added SimSun.ttf
1 change: 0 additions & 1 deletion dataset.py
@@ -826,7 +826,6 @@ def __init__(self,

# Append the PIL Image path to the list
current_frames.append(output_path)

if len(current_frames) < num_frames:
current_frames = current_frames + [current_frames[-1]] * (num_frames - len(current_frames))
elif len(current_frames) > num_frames:
2 changes: 0 additions & 2 deletions eval.py
@@ -41,8 +41,6 @@ def main():
TIFA160_DSG,
Pickapic_v1,
SeeTrue,
StanfordT23D,
T2VScore,
Flickr8K_CF,
]:

24 changes: 23 additions & 1 deletion pyproject.toml
@@ -29,7 +29,7 @@ dependencies = [
"pandas>=2.1.4",
"scipy>=1.11.4",
"sentencepiece>=0.1.99",
"transformers>=4.36.1",
"transformers",
"datasets>=2.15.0",
"tokenizers",
"omegaconf",
@@ -40,8 +40,30 @@ dependencies = [
"pycocoevalcap",
"image-reward",
"hpsv2",
# Video Model Additional Requirements
"llava @ git+https://github.com/LLaVA-VL/LLaVA-NeXT.git",
"fire==0.4.0",
"tiktoken>=0.7.0",
"peft==0.5.0",
"decord",
"easydict",
"av",
"icecream",
"markdown2",
"pydantic",
"accelerate",
"shortuuid",
"bitsandbytes",
"timm",
"requests",
"httpx==0.24.0",
"uvicorn",
"einops-exts",
"einops",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl",
"PyYAML",
"wandb",
"moviepy"
]

[tool.setuptools]
27 changes: 27 additions & 0 deletions t2v_metrics/models/vqascore_models/__init__.py
@@ -2,7 +2,15 @@
from .llava_model import LLAVA_MODELS, LLaVAModel
from .llava16_model import LLAVA16_MODELS, LLaVA16Model
from .instructblip_model import InstructBLIP_MODELS, InstructBLIPModel
from .qwenvl_model import QwenVL_MODELS, QwenVLModel
from .gpt4v_model import GPT4V_MODELS, GPT4VModel
from .llavaov_model import LLAVA_OV_MODELS, LLaVAOneVisionModel
from .mplug_model import MPLUG_OWL3_MODELS, mPLUGOwl3Model
from .paligemma_model import PALIGEMMA_MODELS, PaliGemmaModel
from .internvl_model import INTERNVL2_MODELS, InternVL2Model
from .internvideo_model import INTERNVIDEO2_MODELS, InternVideo2Model
from .internlm_model import INTERNLMXCOMPOSER25_MODELS, InternLMXComposer25Model

from ...constants import HF_CACHE_DIR

ALL_VQA_MODELS = [
@@ -11,8 +19,15 @@
LLAVA16_MODELS,
InstructBLIP_MODELS,
GPT4V_MODELS,
LLAVA_OV_MODELS,
MPLUG_OWL3_MODELS,
PALIGEMMA_MODELS,
INTERNVL2_MODELS,
INTERNVIDEO2_MODELS,
INTERNLMXCOMPOSER25_MODELS
]


def list_all_vqascore_models():
return [model for models in ALL_VQA_MODELS for model in models]

@@ -28,5 +43,17 @@ def get_vqascore_model(model_name, device='cuda', cache_dir=HF_CACHE_DIR, **kwar
return InstructBLIPModel(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in GPT4V_MODELS:
return GPT4VModel(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in LLAVA_OV_MODELS:
return LLaVAOneVisionModel(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in MPLUG_OWL3_MODELS:
return mPLUGOwl3Model(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in PALIGEMMA_MODELS:
return PaliGemmaModel(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in INTERNVL2_MODELS:
return InternVL2Model(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in INTERNVIDEO2_MODELS:
return InternVideo2Model(model_name, device=device, cache_dir=cache_dir, **kwargs)
elif model_name in INTERNLMXCOMPOSER25_MODELS:
return InternLMXComposer25Model(model_name, device=device, cache_dir=cache_dir, **kwargs)
else:
raise NotImplementedError()
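
For context, a short sketch of how the extended registry and dispatcher above might be exercised. The import path and function names come from the diff itself; the model name is taken from the README example, and actually loading it requires the corresponding checkpoint and GPU resources.

```python
from t2v_metrics.models.vqascore_models import (
    get_vqascore_model,
    list_all_vqascore_models,
)

# The registry now also enumerates the video-capable models added in this PR.
print(list_all_vqascore_models())

# Dispatch by name; unknown names raise NotImplementedError.
model = get_vqascore_model('llava-onevision-qwen2-7b-ov', device='cuda')
```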