Merge pull request #1111 from JohnSnowLabs/feature/implement-the-support-for-multimodal-with-new-vqa-task

Feature/implement the support for multimodal with new vqa task
JohnSnowLabs · Sep 17, 2024 · 67c641d · 67c641d
2 parents d3a4663 + b337d2b
commit 67c641d
Show file tree

Hide file tree

Showing 14 changed files with 731 additions and 4 deletions.
diff --git a/demo/tutorials/llm_notebooks/Visual_QA.ipynb b/demo/tutorials/llm_notebooks/Visual_QA.ipynb
diff --git a/langtest/datahandler/datasource.py b/langtest/datahandler/datasource.py
@@ -95,6 +95,12 @@
         "anti-stereotype": ["anti-stereotype"],
         "unrelated": ["unrelated"],
     },
+    "visualqa": {
+        "image": ["image", "image_1"],
+        "question": ["question"],
+        "options": ["options"],
+        "answer": ["answer"],
+    },
 }
 
 
@@ -183,7 +189,7 @@ def __init__(self, file_path: Union[str, dict], task: TaskManager, **kwargs) ->
             raise ValueError(Errors.E024)
 
         if "data_source" not in file_path:
-            raise ValueError(Errors.E025)
+            raise ValueError(Errors.E025())
         self._custom_label = file_path.copy()
         self._file_path = file_path.get("data_source")
         self._size = None
@@ -1246,6 +1252,7 @@ class HuggingFaceDataset(BaseDataset):
         "summarization",
         "ner",
         "question-answering",
+        "visualqa",
     ]
 
     LIB_NAME = "datasets"
@@ -1709,6 +1716,7 @@ class PandasDataset(BaseDataset):
         "legal",
         "factuality",
         "stereoset",
+        "visualqa",
     ]
     COLUMN_NAMES = {task: COLUMN_MAPPER[task] for task in supported_tasks}
 

diff --git a/langtest/langtest.py b/langtest/langtest.py
@@ -605,6 +605,7 @@ def generated_results(self) -> Optional[pd.DataFrame]:
             "model_name",
             "category",
             "test_type",
+            "original_image",
             "original",
             "context",
             "prompt",
@@ -613,8 +614,10 @@ def generated_results(self) -> Optional[pd.DataFrame]:
             "completion",
             "test_case",
             "perturbed_context",
+            "perturbed_image",
             "perturbed_question",
             "sentence",
+            "question",
             "patient_info_A",
             "patient_info_B",
             "case",
@@ -838,6 +841,7 @@ def testcases(self, additional_cols=False) -> pd.DataFrame:
             "model_name",
             "category",
             "test_type",
+            "original_image",
             "original",
             "context",
             "original_context",
@@ -863,7 +867,9 @@ def testcases(self, additional_cols=False) -> pd.DataFrame:
             "correct_sentence",
             "incorrect_sentence",
             "perturbed_context",
+            "perturbed_image",
             "perturbed_question",
+            "question",
             "ground_truth",
             "options",
             "expected_result",

diff --git a/langtest/modelhandler/llm_modelhandler.py b/langtest/modelhandler/llm_modelhandler.py
@@ -13,6 +13,7 @@
 import logging
 from functools import lru_cache
 from langtest.utils.custom_types.helpers import HashableDict
+from langchain.chat_models.base import BaseChatModel
 
 
 class PretrainedModelForQA(ModelAPI):
@@ -452,3 +453,57 @@ class PretrainedModelForSycophancy(PretrainedModelForQA, ModelAPI):
     """
 
     pass
+
+
+class PretrainedModelForVisualQA(PretrainedModelForQA, ModelAPI):
+    """A class representing a pretrained model for visual question answering.
+
+    Sends one human message made of the formatted text prompt followed by an
+    ``image_url`` part for every supplied image, and returns the reply content.
+
+    Inherits:
+        PretrainedModelForQA: The base class for pretrained models.
+    """
+
+    # NOTE(review): lru_cache hashes every call argument (``self`` included), so
+    # ``text``, ``prompt`` and ``images`` must be hashable when passed in
+    # (presumably HashableDict / tuple) -- confirm against the callers.
+    @lru_cache(maxsize=102400)
+    def predict(
+        self, text: Union[str, dict], prompt: dict, images: List[Any], *args, **kwargs
+    ):
+        """Perform prediction using the pretrained model.
+
+        Args:
+            text (Union[str, dict]): Values substituted into the prompt template.
+                It is unpacked with ``**``, so a mapping is required in practice.
+            prompt (dict): Keyword arguments used to build the ``PromptTemplate``.
+            images (List[Any]): Images; each one is sent as the ``url`` of an
+                ``image_url`` message part.
+            *args: Additional positional arguments (not used here).
+            **kwargs: Additional keyword arguments (not used here).
+
+        Returns:
+            The ``content`` attribute of the response returned by the model.
+
+        Raises:
+            ValueError: If the wrapped model is not a ``BaseChatModel``, or if
+                building the prompt or invoking the model fails (the latter is
+                reported through ``Errors.E089``).
+        """
+        # The guard previously built the exception without raising it, so it
+        # never fired. It lives outside the ``try`` so its message is not
+        # re-wrapped in E089.
+        if not isinstance(self.model, BaseChatModel):
+            raise ValueError("visualQA task is only supported for chat models")
+
+        try:
+            # prepare prompt
+            prompt_template = PromptTemplate(**prompt)
+            from langchain_core.messages import HumanMessage
+
+            image_parts = [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": image},
+                }
+                for image in images
+            ]
+
+            # Text part first, then every image part, all in a single message.
+            message = HumanMessage(
+                content=[
+                    {"type": "text", "text": prompt_template.format(**text)},
+                    *image_parts,
+                ]
+            )
+
+            response = self.model.invoke([message])
+            return response.content
+
+        except Exception as e:
+            # Chain the cause so the original traceback is preserved.
+            raise ValueError(Errors.E089(error_message=e)) from e
diff --git a/langtest/tasks/task.py b/langtest/tasks/task.py
@@ -851,3 +851,44 @@ def create_sample(
 
 class FillMask(BaseTask):
     pass
+
+
+class VisualQA(BaseTask):
+    """Visual question answering task that builds ``VisualQASample`` objects."""
+
+    _name = "visualqa"
+    _default_col = {
+        "image": ["image"],
+        "question": ["question"],
+        "answer": ["answer"],
+    }
+    sample_class = samples.VisualQASample
+
+    def create_sample(
+        cls,
+        row_data: dict,
+        image: str = "image_1",
+        question: str = "question",
+        options: str = "options",
+        answer: str = "answer",
+        dataset_name: str = "",
+    ) -> samples.VisualQASample:
+        """Create a visual QA sample from a single dataset row.
+
+        Args:
+            row_data (dict): One row of the dataset, keyed by column name.
+            image (str): Name of the image column.
+            question (str): Name of the question column.
+            options (str): Name of the answer-options column (may be absent).
+            answer (str): Name of the expected-answer column.
+            dataset_name (str): Dataset name stored on the sample.
+
+        Returns:
+            samples.VisualQASample: Sample holding the image, question,
+            options and expected result taken from the row.
+        """
+        keys = list(row_data.keys())
+
+        # auto-detect the default column names from the row_data
+        column_mapper = cls.column_mapping(keys, [image, question, options, answer])
+
+        # "-" is the placeholder used when the row carries no options column.
+        options_value = row_data.get(column_mapper.get(options, "-"), "-")
+
+        # A stringified list such as "['cat', 'dog']" is parsed and rendered as
+        # lettered choices ("A. cat\nB. dog"). The value fetched above is reused
+        # instead of a second lookup under the hard-coded "options" key, which
+        # could disagree with a custom ``options`` argument. The isinstance
+        # check keeps non-string cells (e.g. None) from failing on len().
+        if (
+            isinstance(options_value, str)
+            and len(options_value) > 3
+            and options_value[0] == "["
+            and options_value[-1] == "]"
+        ):
+            parsed_options = ast.literal_eval(options_value)
+            options_value = "\n".join(
+                f"{chr(65 + i)}. {option}" for i, option in enumerate(parsed_options)
+            )
+
+        return samples.VisualQASample(
+            original_image=row_data[column_mapper[image]],
+            question=row_data[column_mapper[question]],
+            options=options_value,
+            expected_result=row_data[column_mapper[answer]],
+            dataset_name=dataset_name,
+        )
diff --git a/langtest/transform/__init__.py b/langtest/transform/__init__.py
@@ -22,6 +22,8 @@
 from langtest.transform.grammar import GrammarTestFactory
 from langtest.transform.safety import SafetyTestFactory
 
+from langtest.transform import image
+
 # Fixing the asyncio event loop
 nest_asyncio.apply()
 
@@ -47,4 +49,5 @@
     SycophancyTestFactory,
     GrammarTestFactory,
     SafetyTestFactory,
+    image,
 ]
diff --git a/langtest/transform/image/__init__.py b/langtest/transform/image/__init__.py
@@ -0,0 +1,3 @@
+from .robustness import ImageResizing, ImageRotation, ImageBlur, ImageNoise
+
+# __all__ must list names as strings; class objects break ``import *``.
+__all__ = ["ImageResizing", "ImageRotation", "ImageBlur", "ImageNoise"]