From ecd88d88f0cf36c703016912f60b0cf19e81bbc7 Mon Sep 17 00:00:00 2001 From: Colin Wang Date: Tue, 23 Jul 2024 21:59:17 -0400 Subject: [PATCH] update v1 --- .gitignore | 7 +- README.md | 112 +++++++++++------------ evaluate.sh | 16 ++++ generate.sh | 20 ++++ run.sh | 22 ----- src/generate.py | 48 +++------- src/generate_lib/cambrian.py | 75 +++++++++++++++ src/generate_lib/chartgemma.py | 30 ++++++ src/generate_lib/claude.py | 44 +++++++++ src/generate_lib/deepseekvl.py | 55 +++++++++++ src/generate_lib/gemini.py | 38 ++++++++ src/generate_lib/gpt.py | 70 ++++++++++++++ src/generate_lib/idefics2.py | 28 ++++++ src/generate_lib/internvl15.py | 106 ++++++++++++++++++++++ src/generate_lib/internvl2.py | 138 ++++++++++++++++++++++++++++ src/generate_lib/ixc2.py | 24 +++++ src/generate_lib/llava16.py | 31 +++++++ src/generate_lib/mgm.py | 112 +++++++++++++++++++++++ src/generate_lib/minicpm.py | 28 ++++++ src/generate_lib/moai.py | 38 ++++++++ src/generate_lib/paligemma.py | 33 +++++++ src/generate_lib/phi3.py | 30 ++++++ src/generate_lib/qwen.py | 25 +++++ src/generate_lib/reka.py | 27 ++++++ src/generate_lib/sphinx2.py | 14 +++ src/generate_lib/utils.py | 136 ++++++++++++++++++++++++++++ src/generate_lib/vila15.py | 80 ++++++++++++++++ src/get_score.py | 20 ---- src/get_stats.py | 122 +++++++++++++++++++++++++ src/score_utils.py | 161 +++++++++++++++++++++++++++++++++ 30 files changed, 1552 insertions(+), 138 deletions(-) create mode 100644 evaluate.sh create mode 100644 generate.sh delete mode 100644 run.sh create mode 100644 src/generate_lib/cambrian.py create mode 100644 src/generate_lib/chartgemma.py create mode 100644 src/generate_lib/claude.py create mode 100644 src/generate_lib/deepseekvl.py create mode 100644 src/generate_lib/gemini.py create mode 100644 src/generate_lib/gpt.py create mode 100644 src/generate_lib/idefics2.py create mode 100644 src/generate_lib/internvl15.py create mode 100644 src/generate_lib/internvl2.py create mode 100644 src/generate_lib/ixc2.py create mode 100644 src/generate_lib/llava16.py create mode 100644 src/generate_lib/mgm.py create mode 100644 src/generate_lib/minicpm.py create mode 100644 src/generate_lib/moai.py create mode 100644 src/generate_lib/paligemma.py create mode 100644 src/generate_lib/phi3.py create mode 100644 src/generate_lib/qwen.py create mode 100644 src/generate_lib/reka.py create mode 100644 src/generate_lib/sphinx2.py create mode 100644 src/generate_lib/utils.py create mode 100644 src/generate_lib/vila15.py delete mode 100644 src/get_score.py create mode 100644 src/get_stats.py create mode 100644 src/score_utils.py diff --git a/.gitignore b/.gitignore index 75df521..3fc5266 100644 --- a/.gitignore +++ b/.gitignore @@ -161,5 +161,10 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ temp/* +results/* +slurm/* images/*.jpg -images/*.zip \ No newline at end of file +images/*.zip +internal_generate.sh +internal_evaluate.sh +slurm_submit.sh \ No newline at end of file diff --git a/README.md b/README.md index d29e2d3..fa16c06 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # πŸ“Š CharXiv -🏠[Home](https://charxiv.github.io/) | πŸ€—[Data](https://huggingface.co/datasets/princeton-nlp/CharXiv) | πŸ₯‡[Leaderboard](https://charxiv.github.io/#leaderboard) | πŸ“„Paper (Soon!) 
| *Current Version: Beta* +🏠[Home](https://charxiv.github.io/) | πŸ€—[Data](https://huggingface.co/datasets/princeton-nlp/CharXiv) | πŸ₯‡[Leaderboard](https://charxiv.github.io/#leaderboard) | πŸ“„[Paper](https://arxiv.org/abs/2406.18521) | *Current Version: v1.0* -This repository contains the code to evaluate models on CharXiv from the paper CharXiv: Charting Gaps in Realistic Chart Understanding in Multimodal LLMs. +This repository contains the code to evaluate models on CharXiv from the paper [CharXiv: Charting Gaps in Realistic Chart Understanding in Multimodal LLMs](https://arxiv.org/abs/2406.18521). *πŸ€— We are first-time evaluation suite builders, and this codebase is released for the first time. We are committed to improving it. If you have any questions, feel free to raise issues and/or submit pull requests for new features or bug fixes.* @@ -9,6 +9,12 @@ This repository contains the code to evaluate models on CharXiv from the paper C https://github.com/princeton-nlp/CharXiv/assets/59942464/ab9b293b-8fd6-4735-b8b3-0079ee978b61 +## πŸ“° News +πŸ†• [07/26/2024] Upcoming this week: we'll be releasing scores for [GPT-4o-mini](https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/) as well as the largest and most capable open-weight VLM in our benchmark: [InternVL2 LLaMA-3 76B](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B). Alongside the scores, we'll share on X some interesting patterns we found in how model improvements track different chart understanding benchmarks. Stay tuned! +πŸ†• [07/24/2024] We released the [full evaluation pipeline](https://github.com/princeton-nlp/CharXiv) (i.e., v1.0). +πŸ†• [07/23/2024] We released our [evaluation results](https://huggingface.co/datasets/princeton-nlp/CharXiv/tree/main/existing_evaluations) on **all 34 MLLMs** that we have tested so far -- this includes all models' responses to CharXiv's challenging questions, scores graded by GPT-4o, as well as aggregated stats. +πŸ†• [07/14/2024] We further evaluated the latest [InternVL Chat V2.0 26B](https://huggingface.co/OpenGVLab/InternVL2-26B) and [Cambrian 34B models](https://huggingface.co/nyu-visionx/cambrian-34b) on CharXiv, with some state-of-the-art results. More analysis is available [here](https://x.com/zwcolin/status/1812650435808792731). + ## πŸ‘‹ Introduction Chart understanding plays a pivotal role when applying Multimodal Large Language Models (MLLMs) to real-world tasks such as analyzing scientific papers or financial reports. However, existing datasets often focus on oversimplified and homogeneous charts with template-based questions, leading to an over-optimistic measure of progress. In this work, we propose CharXiv, a comprehensive evaluation suite involving 2,323 natural, challenging, and diverse charts from scientific papers. CharXiv includes two types of questions: (1) descriptive questions about examining basic chart elements and (2) reasoning questions that require synthesizing information across complex visual elements in the chart. To ensure quality, all charts and questions are handpicked, curated, and verified by human experts. Our results reveal a substantial, previously underestimated gap between the reasoning skills of the strongest proprietary model (i.e., GPT-4o), which achieves 47.1% accuracy, and the strongest open-source model (i.e., InternVL Chat V1.5), which achieves 29.2%. All models lag far behind human performance of 80.5%, underscoring weaknesses in the chart understanding capabilities of existing MLLMs. 
We hope CharXiv facilitates future research on MLLM chart understanding by providing a more realistic and faithful measure of progress. @@ -26,37 +32,6 @@ unzip images.zip && rm images.zip
(Optional) A short tour for the codebase - -``` -. -β”œβ”€β”€ data/ -β”‚ β”œβ”€β”€ descriptive_test.json -β”‚ β”œβ”€β”€ descriptive_val.json -β”‚ β”œβ”€β”€ image_metadata_test.json -β”‚ β”œβ”€β”€ image_metadata_val.json -β”‚ β”œβ”€β”€ reasoning_test.json -β”‚ β”œβ”€β”€ reasoning_val.json -β”‚ β”œβ”€β”€ README.md -β”‚ └── LICENSE -β”œβ”€β”€ images/ -β”‚ β”œβ”€β”€ 0.jpg -β”‚ β”œβ”€β”€ ... -β”‚ β”œβ”€β”€ 2399.jpg -β”‚ └── README.md -β”œβ”€β”€ results/ -β”‚ └── README.md -β”œβ”€β”€ src/ -β”‚ β”œβ”€β”€ constants.py -β”‚ β”œβ”€β”€ descriptive_utils.py -β”‚ β”œβ”€β”€ reasoning_utils.py -β”‚ β”œβ”€β”€ evaluate.py -β”‚ β”œβ”€β”€ generate.py -β”‚ └── get_score.py -β”œβ”€β”€ run.sh -β”œβ”€β”€ README.md -β”œβ”€β”€ LICENSE -└── .gitignore -``` * `data` folder contains all QAs and metadata for images, descriptive questions, and reasoning questions. Answers for the test split are intentionally set to `null` to prevent the test data from leaking to the public. * `images` folder contains all images, whose identifiers range from 0 to 2399. Note that there are only 2333 images in total and the numbering is **not** consecutive. * `results` folder contains all response generation and scoring results. * `src` folder contains all python code for CharXiv: * `constants.py` stores all the prompts and mappings from question ids to actual questions. * `descriptive_utils.py` contains all code to build queries for response generation and grading, as well as saving all artifacts for descriptive questions. * `reasoning_utils.py` contains all code to build queries for response generation and grading, as well as saving all artifacts for reasoning questions. * `evaluate.py` is the main function to evaluate model responses against the answer with gpt API calls. * `generate.py` is the main function to loop QAs for model to generate responses. - * `get_score.py` is the main function to print the reasoning and descriptive question scores. + * `get_stats.py` is the main function to print the reasoning and descriptive question statistics. + * `generate_lib` contains a collection of implementations that enable response generation with different models. -* `run.sh` is the script to evaluate models +* `generate.sh` and `evaluate.sh` are the scripts to generate model responses and to evaluate them, respectively @@ -87,23 +63,35 @@ CharXiv doesn't require any third-party python library when prompting your model }, } ``` -Once you load your models and all preprocessing functions, simply implement the `evaluate` function in `src/generate.py`: + +To run CharXiv for your model, go to the `src/generate_lib/` directory and create a new Python file, e.g., `custom.py`. You can put whatever auxiliary code (e.g., model splitting functions, image processing functions, etc.) inside this file, but you need to implement the `generate_response` function, which takes in `model_path` and `queries` by default. Inside the function, load your model and all preprocessing functions, and let your model generate responses. The script below is a simple usage example. If you need more references, you can look at the existing `*.py` files to see how these functions are implemented for different models. + + ```py for k in tqdm(queries): - query = queries[k]['question'] - image = queries[k]["figure_path"] + query = queries[k]['question'] # This will be a single question with instructions + image = queries[k]["figure_path"] # This will be the path to the figure associated with the above query query, image = preprocess(query, image) #TODO response = model.chat(query, image) #TODO queries[k]['response'] = response ``` -To generate model responses: +Once you finish implementing the `generate_response` function, go to `src/generate_lib/utils.py` and modify `get_generate_fn` to include the function you implemented. 
Say your model's checkpoint folder is called `custom_1` and you implement the `generate_response` function in the `custom.py` file; then all you need to do is add the following code: + +```py +... +elif model_name in ['custom_1']: + from .custom import generate_response +... +``` + +Then, to generate model responses: ```bash -python generate.py \ - --model_name $model_name \ - --split $split \ - --mode $mode \ - --model_path $model_path #custom arg +python src/generate.py \ + --model_name $model_name \ + --split $split \ + --mode $mode \ + --model_path $model_path ``` ⚠️**The first three arguments are required and you should not delete them**. It is your responsibility to ensure the correctness and integrity of the evaluation pipeline if you change them. In particular, @@ -113,17 +101,19 @@ python generate.py \ * Finally, you should either choose `descriptive` or `reasoning` for the `--mode` argument. -βœ…The last argument i.e., `--model_path` is a custom argument and feel free to delete it, modify it or add more args as you see fit. +βœ…The last argument, i.e., `--model_path`, is a custom argument; feel free to delete it, modify it, or add more args as you see fit. Correspondingly, if you change `--model_path`, you should consider changing the input arguments of your `generate_response` function and the function that calls `generate_response`. πŸ—„οΈ Once the process ends, you will find a file in the `results` folder named `gen-<model_name>-<mode>_<split>.json`. This file stores your model's responses. +Note: if you are evaluating a model that is **hosted in the cloud and can only be accessed via an API**, the `--model_path` argument will correspond to the name of the model, e.g., `gpt-4o-2024-05-13`. Also, when creating the custom file in the `src/generate_lib` directory, you need to implement an additional function, i.e., `get_client_model`, which takes in the `model_path` and `api_key` arguments. In addition, you need to add another `elif` statement in `get_client_fn` inside `src/generate_lib/utils.py` with instructions similar to the above. Specific instructions for implementing the `get_client_model` function differ by API provider, and examples are provided in `gpt.py`, `gemini.py`, `reka.py`, `claude.py`, and `qwen.py`. + ### Evaluation ⚠️ Make sure the openai python API library is installed in your evaluation environment. If not, `pip install openai` first. ```bash -python evaluate.py \ +python src/evaluate.py \ --model_name $model_name \ --split $split \ --mode $mode \ --api_key $openai_key ``` The first three arguments are the same as the above (response generation), except that you need to provide your OpenAI API key via the `--api_key` argument. This python script will automatically match the `gen-<model_name>-<mode>_<split>.json` file in the `results` folder and the `<mode>_<split>.json` file in the `data` folder. -πŸ—„οΈ Once the process ends, you will find a file in results folder named: `scores--_.json`. This file stores your model's evaluation results. +πŸ—„οΈ Once the process ends, you will find a file in the `results` folder named `scores-<model_name>-<mode>_<split>.json`. This file stores your model's evaluation results graded by LLMs. Finally, run this: ```bash -python get_score.py \ +python src/get_stats.py \ --model_name $model_name \ - --split $split \ - --mode $mode \ + --split $split ``` -This python script will automatically match the `scores--_.json` file in the `results` folder to calculate the score and print the score in your console. +This python script will automatically match the `scores-<model_name>-<mode>_<split>.json` file in the `results` folder to calculate the stats for aggregated scores. 
Running this script will also generate `stats-<model_name>-<mode>_<split>.json` in the `results` folder. -Note: we will be updating the `get_score.py` file soon to include more metrics (i.e., metrics we reported in our paper). +## πŸ“… Results from Existing Models +We release full results on the validation set (i.e., generated responses, grading done by LLMs, and the aggregated stats) for all models we tested in our [HuggingFace Repo](https://huggingface.co/datasets/princeton-nlp/CharXiv/tree/main/existing_evaluations). If you are interested in doing some fine-grained analysis on these results or calculating some customized metrics, feel free to use them. ## πŸ“œ License Our original data contributions (all data except the charts) are distributed under the [CC BY-SA 4.0](data/LICENSE) license. Our code is licensed under the [Apache 2.0](LICENSE) license. The copyright of the charts belongs to the original authors, whose sources you can find in `image_metadata_val.json` and `image_metadata_test.json` under the data folder. ## πŸ₯Ί Cite If you use or are inspired by our work, please consider citing us: -``` +```bibtex +@article{wang2024charxiv, + title={CharXiv: Charting Gaps in Realistic Chart Understanding in Multimodal LLMs}, + author={Wang, Zirui and Xia, Mengzhou and He, Luxi and Chen, Howard and Liu, Yitao and Zhu, Richard and Liang, Kaiqu and Wu, Xindi and Liu, Haotian and Malladi, Sadhika and Chevalier, Alexis and Arora, Sanjeev and Chen, Danqi}, + journal={arXiv preprint arXiv:2406.18521}, + year={2024} +} ``` ## πŸ™Œ Contributors and Acknowledgement -**πŸ“Š CharXiv is built by a team consisting of:** -Zirui Wang, Mengzhou Xia, Luxi He, Howard Chen, Yitao Liu, Richard Zhu, Kaiqu Liang, Xindi Wu, Haotian Liu, Sadhika Malladi, Alexis Chevalier, Sanjeev Arora, Danqi Chen +**πŸ“Š [CharXiv](https://charxiv.github.io/) is built by a team consisting of:** +[Zirui Wang](https://zwcolin.github.io/), [Mengzhou Xia](https://xiamengzhou.github.io/), [Luxi He](https://twitter.com/luxihelucy), [Howard Chen](https://howard50b.github.io/), [Yitao Liu](https://yitaoliu17.com/), [Richard Zhu](https://richardzhu123.github.io/), [Kaiqu Liang](https://kaiquliang.github.io/), [Xindi Wu](https://xindiwu.github.io/), [Haotian Liu](https://hliu.cc/), [Sadhika Malladi](https://www.cs.princeton.edu/~smalladi/), [Alexis Chevalier](https://pli.princeton.edu/people/alexis-chevalier), [Sanjeev Arora](https://www.cs.princeton.edu/~arora/), [Danqi Chen](https://www.cs.princeton.edu/~danqic/) -Princeton Language and Intelligence, Princeton University -University of Wisconsin, Madison -The University of Hong Kong. +[Princeton Language and Intelligence, Princeton University](https://pli.princeton.edu/) +[University of Wisconsin, Madison](https://www.wisc.edu/) +[The University of Hong Kong](https://www.hku.hk/) πŸ€— We adapted part of [MathVista](https://github.com/lupantech/MathVista)'s codebase in building our evaluation framework, and we greatly appreciate their contributions to the MLLM community. -πŸ€— The lyrics in the teaser video are created by GPT-4o from our abstract, and the music is created by Suno. Video is manually edited using CapCut. +πŸ€— The lyrics in the teaser video are created by [GPT-4o](https://openai.com/index/hello-gpt-4o/) from our abstract, and the music is created by [Suno](https://suno.com/). Video is manually edited using [CapCut](https://www.capcut.com/). 
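To make the custom-model workflow described in the Response generation section concrete, here is a minimal sketch of what a hypothetical `src/generate_lib/custom.py` could look like. It mirrors the pattern used by `idefics2.py` in this patch; the use of `AutoProcessor`/`AutoModelForVision2Seq`, the chat-template prompt, and the generation settings are illustrative assumptions rather than requirements, so adapt them to whatever your checkpoint actually needs.

```py
# Hypothetical src/generate_lib/custom.py -- a sketch only, not part of the repository.
import torch
from PIL import Image
from tqdm import tqdm
from transformers import AutoProcessor, AutoModelForVision2Seq

def generate_response(model_path, queries):
    # Load the model and processor once, then reuse them for every query.
    processor = AutoProcessor.from_pretrained(model_path)
    model = AutoModelForVision2Seq.from_pretrained(
        model_path, torch_dtype=torch.float16
    ).to("cuda").eval()

    for k in tqdm(queries):
        question = queries[k]["question"]  # single question with instructions
        image = Image.open(queries[k]["figure_path"]).convert("RGB")

        # Build a single-turn, single-image prompt (assumes the processor ships a chat
        # template; otherwise substitute your model's own prompt format here).
        messages = [{"role": "user",
                     "content": [{"type": "image"}, {"type": "text", "text": question}]}]
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

        inputs = processor(text=prompt, images=[image], return_tensors="pt").to("cuda")
        with torch.inference_mode():
            output_ids = model.generate(**inputs, max_new_tokens=512, do_sample=False)

        # For decoder-only VLMs the output repeats the prompt, so keep only the new tokens.
        new_tokens = output_ids[:, inputs["input_ids"].shape[1]:]
        queries[k]["response"] = processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
```

You would then register it by adding `elif model_name in ['custom_1']: from .custom import generate_response` to `get_generate_fn` in `src/generate_lib/utils.py`, exactly as described above.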
diff --git a/evaluate.sh b/evaluate.sh new file mode 100644 index 0000000..ca1833d --- /dev/null +++ b/evaluate.sh @@ -0,0 +1,16 @@ +model_name=YOUR_MODEL # custom name for model +split=val # choose from [val, test] +mode=reasoning # choose from [reasoning, descriptive] +openai_key=YOUR_KEY # OpenAI API key for scoring (e.g., used in src/evaluate.py) + +### Query GPT-4o to grade responses ### +python src/evaluate.py \ + --model_name $model_name \ + --split $split \ + --mode $mode \ + --api_key $openai_key + +### Get statistics for the model performance ### +python src/get_stats.py \ + --model_name $model_name \ + --split $split diff --git a/generate.sh b/generate.sh new file mode 100644 index 0000000..66a4357 --- /dev/null +++ b/generate.sh @@ -0,0 +1,20 @@ +model_name=YOUR_MODEL # custom name for model +split=val # choose from [val, test] +mode=reasoning # choose from [reasoning, descriptive] +model_path=YOUR_CKPT # path to the model weights +model_api=YOUR_KEY # API key IFF testing proprietary models + +### generate response for open-weight models ### +python src/generate.py \ + --model_name $model_name \ + --split $split \ + --mode $mode \ + --model_path $model_path + +### generate response for proprietary models ### +# python src/generate.py \ +# --model_name $model_name \ +# --split $split \ +# --mode $mode \ +# --model_path $model_path \ +# --model_api $model_api diff --git a/run.sh b/run.sh deleted file mode 100644 index 0cd5bdd..0000000 --- a/run.sh +++ /dev/null @@ -1,22 +0,0 @@ -model_name=my_model # custom name for the model -openai_key=my_key # OpenAI API key -split=val # choose from val, test -mode=reasoning # choose from reasoning, descriptive -model_path="your_path" # path to the model, customizable argument - -python src/generate.py \ - --model_name $model_name \ - --split $split \ - --mode $mode \ - --model_path $model_path - -python src/evaluate.py \ - --model_name $model_name \ - --split $split \ - --mode $mode \ - --api_key $openai_key - -python src/get_score.py \ - --model_name $model_name \ - --split $split \ - --mode $mode diff --git a/src/generate.py b/src/generate.py index 8f1f5d7..e43fd11 100644 --- a/src/generate.py +++ b/src/generate.py @@ -1,36 +1,7 @@ -import os, json, argparse -from tqdm import tqdm - -# sample code to evaluate the IXC2 4khd model -# https://huggingface.co/internlm/internlm-xcomposer2-4khd-7b -def demo(queries, model_path=None): - import torch - from transformers import AutoModel, AutoTokenizer - assert model_path is not None, "Model path is required for demo" - torch.set_grad_enabled(False) - model = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, - trust_remote_code=True).cuda().eval() - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - for k in tqdm(queries): - query = '' + queries[k]['question'] - image = queries[k]["figure_path"] - with torch.cuda.amp.autocast(): - response, _ = model.chat(tokenizer, query=query, image=image, - hd_num=16, history=[], do_sample=False) - queries[k]['response'] = response - -def evaluate(queries): - """Evaluate the model on the given queries. - - Parameters: - queries (dict): Dictionary of queries to evaluate. 
Each query should have the following keys: - - figure_path (str): Path to the image file - - question (str): Question to ask about the image - - Returns: - None - """ - raise NotImplementedError("Implement your own evaluation pipeline based on your model design") +import os +import json +import argparse +from generate_lib.utils import get_generate_fn, get_client_fn, generate_response_remote_wrapper if __name__ == '__main__': parser = argparse.ArgumentParser() @@ -49,6 +20,7 @@ def evaluate(queries): # custom arguments parser.add_argument('--model_path', type=str, required=True) + parser.add_argument('--model_api', type=str, required=False, default=None) args = parser.parse_args() input_file = os.path.join(args.data_dir, f"{args.mode}_{args.split}.json") @@ -58,7 +30,6 @@ def evaluate(queries): # output file os.makedirs(args.output_dir, exist_ok=True) - assert '-' not in args.model_name, "Model name cannot contain '-'" output_file = os.path.join(args.output_dir, f'gen-{args.model_name}-{args.mode}_{args.split}.json') @@ -75,9 +46,12 @@ def evaluate(queries): print("Evaluation mode:", args.mode) print("Output file:", output_file) - # switch to demo(queries, model_path) for IXC2 4khd model - demo(queries, model_path=args.model_path) - # evaluate(queries) + generate_fn = get_generate_fn(args.model_path) + if args.model_api is not None: + client, model = get_client_fn(args.model_path)(args.model_path, args.model_api) + generate_response_remote_wrapper(generate_fn, queries, model, args.model_api, client) + else: + generate_fn(args.model_path, queries) for k in queries: queries[k].pop("figure_path", None) diff --git a/src/generate_lib/cambrian.py b/src/generate_lib/cambrian.py new file mode 100644 index 0000000..416c081 --- /dev/null +++ b/src/generate_lib/cambrian.py @@ -0,0 +1,75 @@ +# Adapted from https://github.com/cambrian-mllm/cambrian/blob/main/inference.py +# This has support for the Cambrian 34B model + +import os +vlm_codebase = os.environ['VLM_CODEBASE_DIR'] + +import sys +sys.path.append(vlm_codebase + '/cambrian') + +import random +import torch +import numpy as np +from tqdm import tqdm + +from cambrian.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from cambrian.conversation import conv_templates +from cambrian.model.builder import load_pretrained_model +from cambrian.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path + +from PIL import Image + +def generate_response(model_path, queries): + conv_mode = "chatml_direct" + def process(image, question, tokenizer, image_processor, model_config): + qs = question + + if model_config.mm_use_im_start_end: + qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs + else: + qs = DEFAULT_IMAGE_TOKEN + '\n' + qs + + conv = conv_templates[conv_mode].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + image_size = [image.size] + image_tensor = process_images([image], image_processor, model_config) + + input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() + + return input_ids, image_tensor, image_size, prompt + + seed = 42 + torch.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + model_name = get_model_name_from_path(model_path) + tokenizer, model, image_processor, context_len = 
load_pretrained_model(model_path, None, model_name) + + temperature = 0 + + for k in tqdm(queries): + image_path = queries[k]['figure_path'] + image = Image.open(image_path).convert('RGB') + question = queries[k]['question'] + + input_ids, image_tensor, image_sizes, prompt = process(image, question, tokenizer, image_processor, model.config) + input_ids = input_ids.to(device='cuda', non_blocking=True) + with torch.inference_mode(): + output_ids = model.generate( + input_ids, + images=image_tensor, + image_sizes=image_sizes, + do_sample=True if temperature > 0 else False, + temperature=temperature, + num_beams=1, + max_new_tokens=512, + use_cache=True) + + outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() + queries[k]['response'] = outputs diff --git a/src/generate_lib/chartgemma.py b/src/generate_lib/chartgemma.py new file mode 100644 index 0000000..4337db5 --- /dev/null +++ b/src/generate_lib/chartgemma.py @@ -0,0 +1,30 @@ +# Adapted from https://huggingface.co/ahmed-masry/chartgemma +# This has support for the ChartGemma model + +from PIL import Image +from transformers import AutoProcessor, PaliGemmaForConditionalGeneration +import torch +from tqdm import tqdm + +def generate_response(queries, model_path=None): + # Load Model + model = PaliGemmaForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16) + processor = AutoProcessor.from_pretrained(model_path) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = model.to(device) + + for k in tqdm(queries): + image_path = queries[k]['figure_path'] + input_text = queries[k]['question'] + + # Process Inputs + image = Image.open(image_path).convert('RGB') + inputs = processor(text=input_text, images=image, return_tensors="pt") + prompt_length = inputs['input_ids'].shape[1] + inputs = {k: v.to(device) for k, v in inputs.items()} + + # Generate + generate_ids = model.generate(**inputs, num_beams=4, max_new_tokens=512) + output_text = processor.batch_decode(generate_ids[:, prompt_length:], skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + queries[k]['response'] = output_text diff --git a/src/generate_lib/claude.py b/src/generate_lib/claude.py new file mode 100644 index 0000000..8a5b8ca --- /dev/null +++ b/src/generate_lib/claude.py @@ -0,0 +1,44 @@ +import anthropic +import base64 +import json + +def get_client_model(model_path, api_key): + assert api_key is not None, "API key is required for using Claude" + assert model_path is not None, "Model name is required for using Claude" + client = anthropic.Anthropic(api_key=api_key) + model = model_path + return client, model + +def generate_response(image_path, query, model, media_type="image/jpeg", api_key=None, client=None, random_baseline=False): + def encode_image(image_path): + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') + image = encode_image(image_path) + message = client.messages.create( + model=model, + max_tokens=1024, + temperature=0.0, + top_p=1.0, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": media_type, + "data": image, + }, + }, + { + "type": "text", + "text": query, + } + ], + } + ], + ) + message = message.json() + message = json.loads(message) + return message['content'][0]['text'] \ No newline at end of file diff --git a/src/generate_lib/deepseekvl.py b/src/generate_lib/deepseekvl.py new file mode 100644 index 0000000..11b9e85 --- /dev/null +++ 
b/src/generate_lib/deepseekvl.py @@ -0,0 +1,55 @@ +# Adapted from https://github.com/deepseek-ai/DeepSeek-VL?tab=readme-ov-file#simple-inference-example +# This has support for the DeepSeek-VL model + +from transformers import AutoModelForCausalLM +from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM +from deepseek_vl.utils.io import load_pil_images + +import torch +from tqdm import tqdm + +def generate_response(model_path, queries): + # specify the path to the model + vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path) + tokenizer = vl_chat_processor.tokenizer + + vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) + vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval() + for k in tqdm(queries): + image_path, query = queries[k]["figure_path"], queries[k]['question'] + + ## single image conversation example + conversation = [ + { + "role": "User", + "content": f"{query}", + "images": [f"{image_path}"], + }, + {"role": "Assistant", "content": ""}, + ] + + # load images and prepare for inputs + pil_images = load_pil_images(conversation) + prepare_inputs = vl_chat_processor( + conversations=conversation, + images=pil_images, + force_batchify=True + ).to(vl_gpt.device) + + # run image encoder to get the image embeddings + inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs) + + # run the model to get the response + outputs = vl_gpt.language_model.generate( + inputs_embeds=inputs_embeds, + attention_mask=prepare_inputs.attention_mask, + pad_token_id=tokenizer.eos_token_id, + bos_token_id=tokenizer.bos_token_id, + eos_token_id=tokenizer.eos_token_id, + max_new_tokens=512, + do_sample=False, + use_cache=True + ) + + answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True) + queries[k]['response'] = answer diff --git a/src/generate_lib/gemini.py b/src/generate_lib/gemini.py new file mode 100644 index 0000000..f5f8359 --- /dev/null +++ b/src/generate_lib/gemini.py @@ -0,0 +1,38 @@ +from PIL import Image +import google.generativeai as genai +from google.generativeai.types import HarmCategory, HarmBlockThreshold +import time + +def get_client_model(model_path, api_key): + assert api_key is not None, "API key is required for using Gemini" + assert model_path is not None, "Model name is required for using Gemini" + genai.configure(api_key=api_key) + model = genai.GenerativeModel(model_path) + client = None + return client, model + +def generate_response(image_path, query, model, media_type="image/jpeg", api_key=None, client=None, random_baseline=False): + # Load from local file + if 'gemini-1.5-pro-latest' in model.model_name: + time.sleep(0) + elif 'gemini-1.0' in model.model_name: + time.sleep(0) + elif 'flash' in model.model_name: + time.sleep(0) + image = Image.open(image_path) + contents = [image, query] + response = model.generate_content(contents, stream=True, + generation_config=genai.types.GenerationConfig( + candidate_count=1, + max_output_tokens=1000, + temperature=0.0, + top_p=1.0, + ), + safety_settings={ + HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE} +) + response.resolve() + return response.text \ No newline at end of file diff --git a/src/generate_lib/gpt.py b/src/generate_lib/gpt.py new file mode 100644 index 
0000000..1298753 --- /dev/null +++ b/src/generate_lib/gpt.py @@ -0,0 +1,70 @@ +import base64 +import requests + +def get_client_model(model_path, api_key): + assert api_key is not None, "API key is required for using GPT" + assert model_path is not None, "Model name is required for using GPT" + model = model_path + client = None + return client, model + +def generate_response(image_path, query, model, media_type="image/jpeg", api_key=None, client=None, random_baseline=False): + def encode_image(image_path): + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') + + # Getting the base64 string + base64_image = encode_image(image_path) + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {api_key}" + } + if not random_baseline: + payload = { + "model": model, + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": query + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}" + } + } + ] + } + ], + "max_tokens": 1000, + "temperature": 0.0, + "top_p": 1.0, + "seed": 42 + } + else: + payload = { + "model": model, + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": query + } + ] + } + ], + "max_tokens": 1000, + "temperature": 0.0, + "top_p": 1.0, + "seed": 42 + } + + response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) + response = response.json() + return response['choices'][0]['message']['content'] \ No newline at end of file diff --git a/src/generate_lib/idefics2.py b/src/generate_lib/idefics2.py new file mode 100644 index 0000000..7b19fcd --- /dev/null +++ b/src/generate_lib/idefics2.py @@ -0,0 +1,28 @@ +# Adapted from https://huggingface.co/HuggingFaceM4/idefics2-8b +# This has support for all the IDEFICS2 models +from transformers.image_utils import load_image +from transformers import AutoProcessor, AutoModelForVision2Seq +from tqdm import tqdm + +def generate_response(model_path, queries): + model = AutoModelForVision2Seq.from_pretrained(model_path).to('cuda') + processor = AutoProcessor.from_pretrained(model_path) + for k in tqdm(queries): + query = queries[k]['question'] + image = load_image(queries[k]["figure_path"]) + messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": f"{query}"}, + ] + } + ] + prompt = processor.apply_chat_template(messages, add_generation_prompt=True) + inputs = processor(text=prompt, images=[image], return_tensors="pt") + inputs = {k: v.to('cuda') for k, v in inputs.items()} + generated_ids = model.generate(**inputs, max_new_tokens=500, do_sample=False) + generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True) + response = generated_texts[0].split("Assistant:")[-1].strip() + queries[k]['response'] = response diff --git a/src/generate_lib/internvl15.py b/src/generate_lib/internvl15.py new file mode 100644 index 0000000..2be19bc --- /dev/null +++ b/src/generate_lib/internvl15.py @@ -0,0 +1,106 @@ +# Adapted from https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5 +# This has support for all InternVL 1.5 models + +from transformers import AutoTokenizer, AutoModel +import torch +import torchvision.transforms as T +from PIL import Image + +from torchvision.transforms.functional import InterpolationMode +from tqdm import tqdm + +IMAGENET_MEAN = (0.485, 0.456, 0.406) +IMAGENET_STD = (0.229, 0.224, 0.225) + + +def build_transform(input_size): + MEAN, STD = IMAGENET_MEAN, 
IMAGENET_STD + transform = T.Compose([ + T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), + T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=MEAN, std=STD) + ]) + return transform + + +def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size): + best_ratio_diff = float('inf') + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + +def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False): + orig_width, orig_height = image.size + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = set( + (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if + i * j <= max_num and i * j >= min_num) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio( + aspect_ratio, target_ratios, orig_width, orig_height, image_size) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ( + (i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size + ) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images + + +def load_image(image_file, input_size=448, max_num=6): + image = Image.open(image_file).convert('RGB') + transform = build_transform(input_size=input_size) + images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num) + pixel_values = [transform(image) for image in images] + pixel_values = torch.stack(pixel_values) + return pixel_values + +def generate_response(model_path, queries): + model = AutoModel.from_pretrained( + model_path, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + trust_remote_code=True).eval().cuda() + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + generation_config = dict( + num_beams=1, + max_new_tokens=512, + do_sample=False, + ) + for k in tqdm(queries): + query = queries[k]['question'] + pixel_values = load_image(queries[k]["figure_path"], max_num=9).to(torch.bfloat16).cuda() + response = model.chat(tokenizer, pixel_values, query, generation_config) + queries[k]['response'] = response diff --git a/src/generate_lib/internvl2.py b/src/generate_lib/internvl2.py new file mode 100644 index 0000000..cc1e7f4 --- /dev/null +++ b/src/generate_lib/internvl2.py @@ -0,0 +1,138 @@ +# Adapted from 
https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B +# This has support for all InternVL2 models + +import math +import numpy as np +import torch +import torchvision.transforms as T +from PIL import Image +from torchvision.transforms.functional import InterpolationMode +from transformers import AutoModel, AutoTokenizer +from tqdm import tqdm + +IMAGENET_MEAN = (0.485, 0.456, 0.406) +IMAGENET_STD = (0.229, 0.224, 0.225) + +def build_transform(input_size): + MEAN, STD = IMAGENET_MEAN, IMAGENET_STD + transform = T.Compose([ + T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), + T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=MEAN, std=STD) + ]) + return transform + + +def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size): + best_ratio_diff = float('inf') + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + +def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False): + orig_width, orig_height = image.size + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = set( + (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if + i * j <= max_num and i * j >= min_num) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio( + aspect_ratio, target_ratios, orig_width, orig_height, image_size) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ( + (i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size + ) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images + + +def load_image(image_file, input_size=448, max_num=6): + image = Image.open(image_file).convert('RGB') + transform = build_transform(input_size=input_size) + images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num) + pixel_values = [transform(image) for image in images] + pixel_values = torch.stack(pixel_values) + return pixel_values + + +def split_model(model_name): + device_map = {} + world_size = torch.cuda.device_count() + num_layers = {'InternVL2-8B': 32, 'InternVL2-26B': 48, + 'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80}[model_name] + # Since the first GPU will be used for ViT, treat it as half a GPU. 
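+ # e.g., with 4 GPUs and the 80-layer InternVL2-Llama3-76B: ceil(80 / 3.5) = 23 layers are assigned
+ # to each of GPUs 1-3, while GPU 0 gets ceil(23 * 0.5) = 12 layers plus the vision encoder, projector, and output head.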
+ num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5)) + num_layers_per_gpu = [num_layers_per_gpu] * world_size + num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5) + layer_cnt = 0 + for i, num_layer in enumerate(num_layers_per_gpu): + for j in range(num_layer): + device_map[f'language_model.model.layers.{layer_cnt}'] = i + layer_cnt += 1 + device_map['vision_model'] = 0 + device_map['mlp1'] = 0 + device_map['language_model.model.tok_embeddings'] = 0 + device_map['language_model.model.embed_tokens'] = 0 + device_map['language_model.output'] = 0 + device_map['language_model.model.norm'] = 0 + device_map['language_model.lm_head'] = 0 + device_map[f'language_model.model.layers.{num_layers - 1}'] = 0 + + return device_map + +def generate_response(model_path, queries): + device_map = split_model(model_path.split('/')[-1]) + print(device_map) + model = AutoModel.from_pretrained( + model_path, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + trust_remote_code=True, + device_map=device_map).eval() + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + generation_config = dict( + num_beams=1, + max_new_tokens=1024, + do_sample=False, + ) + + for k in tqdm(queries): + image_path = queries[k]['figure_path'] + question = queries[k]['question'] + pixel_values = load_image(image_path, max_num=9).to(torch.bfloat16).cuda() + response = model.chat(tokenizer, pixel_values, question, generation_config) + queries[k]['response'] = response diff --git a/src/generate_lib/ixc2.py b/src/generate_lib/ixc2.py new file mode 100644 index 0000000..a96476b --- /dev/null +++ b/src/generate_lib/ixc2.py @@ -0,0 +1,24 @@ +# Adapted from https://huggingface.co/internlm/internlm-xcomposer2-4khd-7b and https://huggingface.co/internlm/internlm-xcomposer2-vl-7b +# This has support for all InternLM-XComposer2 and InternLM-XComposer2-4KHD models +import torch +from transformers import AutoModel, AutoTokenizer +from tqdm import tqdm + +def generate_response(model_path, queries): + # taken from: + torch.set_grad_enabled(False) + if '4khd' in model_path: + model = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True).cuda().eval() + else: + model = AutoModel.from_pretrained(model_path, trust_remote_code=True).cuda().eval() + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + for k in tqdm(queries): + query = '<ImageHere>' + queries[k]['question'] # prepend the image placeholder token expected by InternLM-XComposer2 + image = queries[k]["figure_path"] + with torch.cuda.amp.autocast(): + if '4khd' in model_path: + # set hd_num to 16 for up to 1344^2 support: https://github.com/InternLM/InternLM-XComposer/issues/252#issuecomment-2049507385 + response, _ = model.chat(tokenizer, query=query, image=image, hd_num=16, history=[], do_sample=False) + else: + response, _ = model.chat(tokenizer, query=query, image=image, history=[], do_sample=False) + queries[k]['response'] = response diff --git a/src/generate_lib/llava16.py b/src/generate_lib/llava16.py new file mode 100644 index 0000000..2865832 --- /dev/null +++ b/src/generate_lib/llava16.py @@ -0,0 +1,31 @@ +# Adapted from https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf and https://huggingface.co/llava-hf/llava-v1.6-34b-hf +# This has support for the LLaVA 1.6 Mistral 7B and Yi 34B models + +import torch +from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration +from tqdm import tqdm +from PIL import Image + +def generate_response(model_path, queries): + # taken from: 
https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf + processor = LlavaNextProcessor.from_pretrained(model_path) + model = LlavaNextForConditionalGeneration.from_pretrained(model_path, + torch_dtype=torch.float16, low_cpu_mem_usage=True).to('cuda:0').eval() + if 'llava-v1.6-mistral-7b-hf' in model_path: + max_tokens, prompt_prefix = 1000, "[/INST]" + elif 'llava-v1.6-34b-hf' in model_path: + max_tokens, prompt_prefix = 100, "<|im_start|> assistant" + for k in tqdm(queries): + image = Image.open(queries[k]["figure_path"]) + if 'llava-v1.6-mistral-7b-hf' in model_path: + prompt = f"[INST] \n{queries[k]['question']} [/INST]" + elif 'llava-v1.6-34b-hf' in model_path: + prompt = f"<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\n{queries[k]['question']}<|im_end|><|im_start|>assistant\n" + try: + inputs = processor(prompt, image, return_tensors="pt").to("cuda:0") + output = model.generate(**inputs, max_new_tokens=max_tokens, do_sample=False) + response = processor.decode(output[0], skip_special_tokens=True) + response = response.split(prompt_prefix)[1].strip() # remove the prompt + except: + response = "Generation Error" + queries[k]['response'] = response diff --git a/src/generate_lib/mgm.py b/src/generate_lib/mgm.py new file mode 100644 index 0000000..2cfeb5e --- /dev/null +++ b/src/generate_lib/mgm.py @@ -0,0 +1,112 @@ +# Adapted from https://github.com/dvlab-research/MGM/blob/main/mgm/serve/cli.py +# This has support for MGM trained on Llama 3 8B and Yi 34B model for the HD version + +from mgm.model.builder import load_pretrained_model +from mgm.utils import disable_torch_init +from mgm.mm_utils import get_model_name_from_path, tokenizer_image_token, process_images +from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from mgm.conversation import conv_templates + +import os +import torch +from tqdm import tqdm +from PIL import Image + +def get_image_input_from_path(image, model, image_processor): + image = Image.open(image) + if hasattr(model.config, 'image_size_aux'): + if not hasattr(image_processor, 'image_size_raw'): + image_processor.image_size_raw = image_processor.crop_size.copy() + image_processor.crop_size['height'] = model.config.image_size_aux + image_processor.crop_size['width'] = model.config.image_size_aux + image_processor.size['shortest_edge'] = model.config.image_size_aux + + image_tensor = process_images([image], image_processor, model.config)[0] + + image_grid = getattr(model.config, 'image_grid', 1) + if hasattr(model.config, 'image_size_aux'): + raw_shape = [image_processor.image_size_raw['height'] * image_grid, + image_processor.image_size_raw['width'] * image_grid] + image_tensor_aux = image_tensor + image_tensor = torch.nn.functional.interpolate(image_tensor[None], + size=raw_shape, + mode='bilinear', + align_corners=False)[0] + else: + image_tensor_aux = [] + + if image_grid >= 2: + raw_image = image_tensor.reshape(3, + image_grid, + image_processor.image_size_raw['height'], + image_grid, + image_processor.image_size_raw['width']) + raw_image = raw_image.permute(1, 3, 0, 2, 4) + raw_image = raw_image.reshape(-1, 3, + image_processor.image_size_raw['height'], + image_processor.image_size_raw['width']) + + if getattr(model.config, 'image_global', False): + global_image = image_tensor + if len(global_image.shape) == 3: + global_image = global_image[None] + global_image = torch.nn.functional.interpolate(global_image, + size=[image_processor.image_size_raw['height'], + 
image_processor.image_size_raw['width']], + mode='bilinear', + align_corners=False) + # [image_crops, image_global] + raw_image = torch.cat([raw_image, global_image], dim=0) + image_tensor = raw_image.contiguous() + + images = image_tensor[None].to(dtype=model.dtype, device='cuda', non_blocking=True) + images_aux = image_tensor_aux[None].to(dtype=model.dtype, device='cuda', non_blocking=True) if len(image_tensor_aux)>0 else None + return images, images_aux, + + +def generate_response(model_path, queries): + disable_torch_init() + model_path = os.path.expanduser(model_path) + model_name = get_model_name_from_path(model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name, + load_8bit=False) + for k in tqdm(queries): + query = queries[k]['question'] + if getattr(model.config, 'mm_use_im_start_end', False): + query = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + query + else: + query = DEFAULT_IMAGE_TOKEN + '\n' + query + if 'MGM-8B-HD' in model_name: + template_name = "llama_3" + elif 'MGM-34B-HD' in model_name: + template_name = "chatml_direct" + else: + raise ValueError(f"Unsupported model name: {model_name}") + conv = conv_templates[template_name].copy() + conv.append_message(conv.roles[0], query) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + try: + images, images_aux = get_image_input_from_path(queries[k]["figure_path"], model, image_processor) + input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() + terminators = tokenizer.eos_token_id + if template_name == "llama_3": + terminators = [terminators, tokenizer.convert_tokens_to_ids("<|eot_id|>")] + with torch.inference_mode(): + output_ids = model.generate( + input_ids, + images=images, + images_aux=images_aux, + do_sample=False, + temperature=0.0, + top_p=1.0, + max_new_tokens=1024, + bos_token_id=tokenizer.bos_token_id, # Begin of sequence token + eos_token_id=terminators, # End of sequence token + pad_token_id=tokenizer.pad_token_id, # Pad token + use_cache=True, + ) + outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() + queries[k]['response'] = outputs + except: + queries[k]['response'] = "Generation Error" diff --git a/src/generate_lib/minicpm.py b/src/generate_lib/minicpm.py new file mode 100644 index 0000000..6b859e2 --- /dev/null +++ b/src/generate_lib/minicpm.py @@ -0,0 +1,28 @@ +# Adapted from https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5 +# This has support for MiniCPM V2 and V2.5 + +from transformers import AutoModel, AutoTokenizer +from tqdm import tqdm +from PIL import Image +import torch + +def generate_response(model_path, queries): + model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16) + model = model.to(device='cuda', dtype=torch.bfloat16) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + model.eval() + + for k in tqdm(queries): + query = queries[k]['question'] + image = Image.open(queries[k]["figure_path"]).convert('RGB') + msgs = [{'role': 'user', 'content': query}] + res, context, _ = model.chat( + image=image, + msgs=msgs, + context=None, + tokenizer=tokenizer, + sampling=False, + temperature=0.0, + top_p=1.0, + ) + queries[k]['response'] = res diff --git a/src/generate_lib/moai.py b/src/generate_lib/moai.py new file mode 100644 index 0000000..19f2ea3 --- /dev/null +++ b/src/generate_lib/moai.py @@ -0,0 +1,38 @@ +# Adapted from 
https://github.com/ByungKwanLee/MoAI/blob/master/demo.py
+# This has support for the MoAI model
+
+import os
+vlm_codebase = os.environ['VLM_CODEBASE_DIR']
+
+import sys
+sys.path.append(vlm_codebase + '/MoAI')
+sys.path.append(vlm_codebase + '/MoAI/moai/sgg')
+
+from moai.load_moai import prepare_moai
+from torchvision.transforms import Resize
+from torchvision.transforms.functional import pil_to_tensor
+
+from tqdm import tqdm
+from PIL import Image
+import torch
+
+def generate_response(model_path, queries):
+    moai_model, moai_processor, seg_model, seg_processor, od_model, od_processor, sgg_model, ocr_model \
+        = prepare_moai(moai_path=model_path, bits=4, grad_ckpt=False, lora=False, dtype='fp16')
+    for k in tqdm(queries):
+        query = queries[k]['question']
+        image = Resize(size=(490, 490), antialias=False)(pil_to_tensor(Image.open(queries[k]["figure_path"])))
+        moai_inputs = moai_model.demo_process(image=image,
+                                              prompt=query,
+                                              processor=moai_processor,
+                                              seg_model=seg_model,
+                                              seg_processor=seg_processor,
+                                              od_model=od_model,
+                                              od_processor=od_processor,
+                                              sgg_model=sgg_model,
+                                              ocr_model=ocr_model,
+                                              device='cuda:0')
+        with torch.inference_mode():
+            generate_ids = moai_model.generate(**moai_inputs, do_sample=False, temperature=0.0, top_p=1.0, max_new_tokens=256, use_cache=True)
+        answer = moai_processor.batch_decode(generate_ids, skip_special_tokens=True)[0].split('[U')[0]
+        queries[k]['response'] = answer
diff --git a/src/generate_lib/paligemma.py b/src/generate_lib/paligemma.py
new file mode 100644
index 0000000..50ff0e8
--- /dev/null
+++ b/src/generate_lib/paligemma.py
@@ -0,0 +1,33 @@
+# Adapted from https://huggingface.co/google/paligemma-3b-pt-896
+# This has support for the PaliGemma model
+
+from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
+from PIL import Image
+import torch
+from tqdm import tqdm
+
+def generate_response(queries, model_path=None):
+    model_id = model_path
+    device = "cuda:0"
+    dtype = torch.bfloat16
+
+    model = PaliGemmaForConditionalGeneration.from_pretrained(
+        model_id,
+        torch_dtype=dtype,
+        device_map=device,
+        revision="bfloat16",
+    ).eval()
+    processor = AutoProcessor.from_pretrained(model_id)
+
+    for k in tqdm(queries):
+        image_path = queries[k]['figure_path']
+        prompt = queries[k]['question']
+        image = Image.open(image_path)
+        model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
+        input_len = model_inputs["input_ids"].shape[-1]
+
+        with torch.inference_mode():
+            generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
+            generation = generation[0][input_len:]
+            decoded = processor.decode(generation, skip_special_tokens=True)
+        queries[k]['response'] = decoded
diff --git a/src/generate_lib/phi3.py b/src/generate_lib/phi3.py
new file mode 100644
index 0000000..694e34e
--- /dev/null
+++ b/src/generate_lib/phi3.py
@@ -0,0 +1,30 @@
+# Adapted from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct
+# This has support for the Phi 3 model
+
+from PIL import Image
+from transformers import AutoModelForCausalLM, AutoProcessor
+from tqdm import tqdm
+
+def generate_response(queries, model_path=None):
+
+
+    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cuda", trust_remote_code=True,
+                                                 torch_dtype="auto", _attn_implementation='flash_attention_2')
+    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+
+    for k in tqdm(queries):
+        query = queries[k]['question']
+        image = queries[k]["figure_path"]
+        image = Image.open(image).convert('RGB')
+        messages = [{'role': 'user', 'content': f"<|image_1|>\n{query}"}]
+        prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(prompt, [image], return_tensors="pt").to("cuda:0")
+        generation_args = {
+            "max_new_tokens": 500,
+            "temperature": 0.0,
+            "do_sample": False,
+        }
+        generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args)
+        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+        result = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        queries[k]['response'] = result
diff --git a/src/generate_lib/qwen.py b/src/generate_lib/qwen.py
new file mode 100644
index 0000000..1f6a6e3
--- /dev/null
+++ b/src/generate_lib/qwen.py
@@ -0,0 +1,25 @@
+from dashscope import MultiModalConversation
+import dashscope
+
+def get_client_model(model_path, api_key):
+    assert api_key is not None, "API key is required for using Qwen"
+    assert model_path is not None, "Model name is required for using Qwen"
+    model = model_path
+    client = None
+    return client, model
+
+def generate_response(image_path, query, model, media_type="image/jpeg", api_key=None, client=None, random_baseline=False):
+    dashscope.api_key = api_key
+    messages = [{
+        'role': 'user',
+        'content': [
+            {
+                'image': image_path
+            },
+            {
+                'text': query
+            }
+        ]
+    }]
+    response = MultiModalConversation.call(model=model, messages=messages, temperature=0.0, top_p=0.99999, seed=42)
+    return response.output['choices'][0]['message']['content'][0]['text']
diff --git a/src/generate_lib/reka.py b/src/generate_lib/reka.py
new file mode 100644
index 0000000..659d246
--- /dev/null
+++ b/src/generate_lib/reka.py
@@ -0,0 +1,27 @@
+import reka
+import base64
+import os
+
+def get_client_model(model_path, api_key):
+    assert api_key is not None, "API key is required for using Reka"
+    assert model_path is not None, "Model name is required for using Reka"
+    os.environ["REKA_API_KEY"] = api_key
+    model = model_path
+    client = None
+    return client, model
+
+def generate_response(image_path, query, model, media_type="image/jpeg", api_key=None, client=None, random_baseline=False):
+    reka.API_KEY = api_key
+    def encode_image(image_path):
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode("utf-8")
+
+    response = reka.chat(
+        query,
+        media_filename=image_path,
+        model_name=model,
+        request_output_len=1024,
+        temperature=0.0,
+        runtime_top_p=1.0,
+    )
+    return response['text']
diff --git a/src/generate_lib/sphinx2.py b/src/generate_lib/sphinx2.py
new file mode 100644
index 0000000..f69c25c
--- /dev/null
+++ b/src/generate_lib/sphinx2.py
@@ -0,0 +1,14 @@
+# Adapted from https://github.com/Alpha-VLLM/LLaMA2-Accessory/blob/main/SPHINX/README.md#single-gpu-inference
+# This has support for the SPHINX 2 Llama 13B model
+
+from SPHINX import SPHINXModel
+from PIL import Image
+from tqdm import tqdm
+
+def generate_response(model_path, queries):
+    model = SPHINXModel.from_pretrained(pretrained_path=model_path, with_visual=True)
+    for k in tqdm(queries):
+        qas = [[queries[k]['question'], None]]
+        image = Image.open(queries[k]["figure_path"])
+        response = model.generate_response(qas, image, max_gen_len=1024, temperature=0.0, top_p=1, seed=42)
+        queries[k]['response'] = response
diff --git a/src/generate_lib/utils.py b/src/generate_lib/utils.py
new file mode 100644
index 0000000..baaa398
--- /dev/null
+++ b/src/generate_lib/utils.py
@@ -0,0 +1,136 @@
+import time
+from tqdm import tqdm
+
+def generate_response_remote_wrapper(generate_fn,
+                                     queries, model_path, api_key, client, init_sleep=1,
+                                     max_retries=10, sleep_factor=1.6):
+    for k in tqdm(queries):
+        sleep_time = init_sleep
+        query = queries[k]['question']
+        image = queries[k]["figure_path"]
+        curr_retries = 0
+        result = None
+        while curr_retries < max_retries and result is None:
+            try:
+                result = generate_fn(image, query, model_path,
+                                     api_key=api_key, client=client, random_baseline=False)
+            except Exception as e:
+                print(f"Error {curr_retries}: {e}, sleeping for {sleep_time} seconds...")
+                time.sleep(sleep_time)
+                curr_retries += 1
+                sleep_time *= sleep_factor
+        if result is None:
+            result = "Error in generating response."
+            print(f"Error in generating response for {k}")
+        queries[k]['response'] = result
+
+def get_client_fn(model_path):
+    if model_path in ['claude-3-sonnet-20240229',
+                      'claude-3-opus-20240229',
+                      'claude-3-haiku-20240307',
+                      'claude-3-5-sonnet-20240620']:
+        from .claude import get_client_model
+    # gemini
+    elif model_path in ['gemini-1.5-pro-001',
+                        'gemini-1.0-pro-vision-001',
+                        'gemini-1.5-flash-001']:
+        from .gemini import get_client_model
+    # gpt
+    elif model_path in ['gpt-4o-2024-05-13',
+                        'gpt-4-turbo-2024-04-09',
+                        'gpt-4o-mini-2024-07-18']:
+        from .gpt import get_client_model
+    # reka
+    elif model_path in ['reka-core-20240415',
+                        'reka-flash-20240226',
+                        'reka-core-20240415']:
+        from .reka import get_client_model
+    # qwen
+    elif model_path in ['qwen-vl-max',
+                        'qwen-vl-plus']:
+        from .qwen import get_client_model
+    else:
+        raise ValueError(f"Model {model_path} not supported")
+    return get_client_model
+
+def get_generate_fn(model_path):
+    model_name = model_path.split('/')[-1]
+    # cambrian
+    if model_name in ['cambrian-34b']:
+        from .cambrian import generate_response
+    # chartgemma
+    elif model_name in ['chartgemma']:
+        from .chartgemma import generate_response
+    # claude
+    elif model_name in ['claude-3-sonnet-20240229',
+                        'claude-3-opus-20240229',
+                        'claude-3-haiku-20240307',
+                        'claude-3-5-sonnet-20240620']:
+        from .claude import generate_response
+    # deepseekvl
+    elif model_name in ['deepseek-vl-7b-chat']:
+        from .deepseekvl import generate_response
+    # gemini
+    elif model_name in ['gemini-1.5-pro-001',
+                        'gemini-1.0-pro-vision-001',
+                        'gemini-1.5-flash-001']:
+        from .gemini import generate_response
+    # gpt
+    elif model_name in ['gpt-4o-2024-05-13',
+                        'gpt-4-turbo-2024-04-09',
+                        'gpt-4o-mini-2024-07-18']:
+        from .gpt import generate_response
+    # idefics2
+    elif model_name in ['idefics2-8b',
+                        'idefics2-8b-chatty']:
+        from .idefics2 import generate_response
+    # ixc2
+    elif model_name in ['internlm-xcomposer2-4khd-7b',
+                        'internlm-xcomposer2-vl-7b']:
+        from .ixc2 import generate_response
+    # internvl2
+    elif model_name in ['InternVL2-26B',
+                        'InternVL2-Llama3-76B']:
+        from .internvl2 import generate_response
+    # internvl15
+    elif model_name in ['InternVL-Chat-V1-5']:
+        from .internvl15 import generate_response
+    # llava16
+    elif model_name in ['llava-v1.6-34b-hf',
+                        'llava-v1.6-mistral-7b-hf']:
+        from .llava16 import generate_response
+    # mgm
+    elif model_name in ['MGM-34B-HD',
+                        'MGM-8B-HD']:
+        from .mgm import generate_response
+    # minicpm
+    elif model_name in ['MiniCPM-Llama3-V-2_5',
+                        'MiniCPM-V-2']:
+        from .minicpm import generate_response
+    # moai
+    elif model_name in ['MoAI-7B']:
+        from .moai import generate_response
+    # paligemma
+    elif model_name in ['paligemma-3b-mix-448']:
+        from .paligemma import generate_response
+    # phi3
+    elif model_name in ['Phi-3-vision-128k-instruct']:
+        from .phi3 import generate_response
+    # qwen
+    elif model_name in ['qwen-vl-max',
+                        'qwen-vl-plus']:
+        from .qwen import generate_response
+    # reka
+    elif model_name in ['reka-core-20240415',
+                        'reka-flash-20240226',
+                        'reka-core-20240415']:
+        from .reka import generate_response
+    # sphinx
+    elif model_name in ['SPHINX-v2-1k']:
+        from .sphinx2 import generate_response
+    # vila
+    elif model_name in ['VILA1.5-40b']:
+        from .vila15 import generate_response
+    else:
+        raise ValueError(f"Model {model_name} not supported")
+    return generate_response
diff --git a/src/generate_lib/vila15.py b/src/generate_lib/vila15.py
new file mode 100644
index 0000000..0badfa0
--- /dev/null
+++ b/src/generate_lib/vila15.py
@@ -0,0 +1,80 @@
+# Adapted from https://github.com/NVlabs/VILA/blob/main/predict.py
+# This has support for the VILA 40B model
+
+import os
+vlm_codebase = os.environ['VLM_CODEBASE_DIR']
+
+import sys
+sys.path.append(vlm_codebase + '/VILA')
+
+from io import BytesIO
+import requests
+import torch
+from PIL import Image
+
+from llava.constants import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
+                             DEFAULT_IMAGE_TOKEN, IMAGE_PLACEHOLDER,
+                             IMAGE_TOKEN_INDEX)
+from llava.conversation import SeparatorStyle, conv_templates
+from llava.mm_utils import (KeywordsStoppingCriteria, get_model_name_from_path,
+                            process_images, tokenizer_image_token)
+from llava.model.builder import load_pretrained_model
+from llava.utils import disable_torch_init
+
+from tqdm import tqdm
+
+def load_image(image_file):
+    image = Image.open(image_file).convert("RGB")
+    return image
+
+def load_images(image_files):
+    out = []
+    for image_file in image_files:
+        image = load_image(image_file)
+        out.append(image)
+    return out
+
+def generate_response(queries, model_path=None):
+    disable_torch_init()
+
+    model_name = get_model_name_from_path(model_path)
+    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, model_name, None)
+    image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
+
+    for k in tqdm(queries):
+        images = load_images([queries[k]['figure_path']])
+        qs = queries[k]['question']
+        if model.config.mm_use_im_start_end:
+            qs = (image_token_se + "\n") * len(images) + qs
+        else:
+            qs = (DEFAULT_IMAGE_TOKEN + "\n") * len(images) + qs
+        conv_mode = "hermes-2"
+        conv = conv_templates[conv_mode].copy()
+        conv.append_message(conv.roles[0], qs)
+        conv.append_message(conv.roles[1], None)
+        prompt = conv.get_prompt()
+
+        images_tensor = process_images(images, image_processor, model.config).to(model.device, dtype=torch.float16)
+        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()
+
+        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+        keywords = [stop_str]
+        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+        with torch.inference_mode():
+            output_ids = model.generate(
+                input_ids,
+                images=[
+                    images_tensor,
+                ],
+                do_sample=False,
+                max_new_tokens=512,
+                use_cache=True,
+                stopping_criteria=[stopping_criteria],
+            )
+
+        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
+        outputs = outputs.strip()
+        if outputs.endswith(stop_str):
+            outputs = outputs[: -len(stop_str)]
+        outputs = outputs.strip()
+        queries[k]['response'] = outputs
diff --git a/src/get_score.py b/src/get_score.py
deleted file mode 100644
index 4a96401..0000000
--- a/src/get_score.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import json
-import argparse
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # input/output
-    parser.add_argument('--model_name', type=str, required=True)
-    parser.add_argument('--split', type=str, required=True)
-    parser.add_argument('--mode', type=str, required=True)
-    parser.add_argument('--score_prefix', type=str, default='scores-')
-    args = parser.parse_args()
-
-    file_path = f"results/{args.score_prefix}{args.model_name}-{args.mode}_{args.split}.json"
-    data = json.load(open(file_path))
-    scores = [d['score'] for d in data.values()]
-    scores = format(100 * sum(scores)/len(scores), '.2f')
-    print(f"Split: {args.split}")
-    print(f"Mode: {args.mode}")
-    print(f"Model: {args.model_name}")
-    print(f"Score: {scores}")
diff --git a/src/get_stats.py b/src/get_stats.py
new file mode 100644
index 0000000..7a4e5a1
--- /dev/null
+++ b/src/get_stats.py
@@ -0,0 +1,122 @@
+import json
+import argparse
+import os
+
+from score_utils import DOMAIN2ABBR, NUM2YEAR, QNUM2QTYPE, \
+                        NUMSUBPLOTS2SUBPLOTTYPE, D_TEMPLATE, R_TEMPLATE, \
+                        IDX2ANSTYPE, IDX2SRC
+
+def get_descriptive_scores(scores, dmeta, rmeta, imeta):
+    stats = D_TEMPLATE()
+    for k, v in dmeta.items():
+        num_subplot = dmeta[k]['num_subplots']
+        subject = imeta[k]['category']
+        year = imeta[k]['year']
+        for i in range(4):
+            subq_key = f"{k}_{i}"
+            score = scores[subq_key]['score']
+            qnum = dmeta[k]['qids'][i]
+
+            if score not in [0, 1]:
+                stats['N_invalid'].append(1)
+                score = 0
+
+            stats['N_valid'].append(1)
+            stats['Overall Score'].append(score)
+            stats['By Category'][QNUM2QTYPE(qnum)].append(score)
+            stats['By Subject'][DOMAIN2ABBR[subject]].append(score)
+            stats['By Year'][NUM2YEAR[year]].append(score)
+            stats['By Subplot'][NUMSUBPLOTS2SUBPLOTTYPE(num_subplot)].append(score)
+            stats['By Question'][f'Q{qnum}'].append(score)
+    stats['Question Type'] = 'Descriptive'
+    return stats
+
+def get_reasoning_scores(scores, dmeta, rmeta, imeta):
+    stats = R_TEMPLATE()
+    for k, v in rmeta.items():
+        num_subplot = dmeta[k]['num_subplots']
+        subject = imeta[k]['category']
+        year = imeta[k]['year']
+        answer_type = rmeta[k]['inst_category']
+        source = rmeta[k]['qa_source']
+        score = scores[k]['score']
+        if score not in [0, 1]:
+            stats['N_invalid'].append(1)
+            score = 0
+
+        stats['N_valid'].append(1)
+        stats['Overall Score'].append(score)
+        stats['By Answer Type'][IDX2ANSTYPE[answer_type]].append(score)
+        stats['By Source'][IDX2SRC[source]].append(score)
+        stats['By Subject'][DOMAIN2ABBR[subject]].append(score)
+        stats['By Year'][NUM2YEAR[year]].append(score)
+        stats['By Subplot'][NUMSUBPLOTS2SUBPLOTTYPE(num_subplot)].append(score)
+    stats['Question Type'] = 'Reasoning'
+    return stats
+
+def get_stats(stats):
+    if len(stats['N_valid']) == 0:
+        print("No valid scores")
+        return
+    for k, v in stats.items():
+        # for sub categories
+        if isinstance(v, dict):
+            for k1, v1 in v.items():
+                if len(v1) == 0:
+                    print(f"{k1}: No valid scores")
+                    stats[k][k1] = 0
+                else:
+                    stats[k][k1] = round(100 * sum(v1)/len(v1), 2)
+        # metadata
+        elif k == 'Question Type':
+            pass
+        # for overall scores
+        elif k not in ['N_valid', 'N_invalid']:
+            if len(v) == 0:
+                print(f"{k}: No valid scores")
+                stats[k] = 0
+            else:
+                stats[k] = round(100 * sum(v)/len(v), 2)
+        # for number of valid/invalid scores
+        else:
+            stats[k] = len(v)
+    return stats
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # input/output
+    parser.add_argument('--model_name', type=str, required=True)
+    parser.add_argument('--split', type=str, required=True)
+    parser.add_argument('--score_prefix', type=str, default='scores-')
+    parser.add_argument('--stats_prefix', type=str, default='stats-')
+    args = parser.parse_args()
+
+    descriptive_score_path = f"full_results/{args.score_prefix}{args.model_name}-descriptive_{args.split}.json"
+    reasoning_score_path = f"full_results/{args.score_prefix}{args.model_name}-reasoning_{args.split}.json"
+
+    image_meta = json.load(open(f"data/image_metadata_{args.split}.json"))
+    descriptive_meta = json.load(open(f"data/descriptive_{args.split}.json"))
+    reasoning_meta = json.load(open(f"data/reasoning_{args.split}.json"))
+
+
+    if os.path.exists(reasoning_score_path):
+        reasoning_scores = json.load(open(reasoning_score_path))
+        reasoning_stats = get_reasoning_scores(reasoning_scores, descriptive_meta,
+                                               reasoning_meta, image_meta)
+        reasoning_stats = get_stats(reasoning_stats)
+        json.dump(reasoning_stats, open(f"full_results/{args.stats_prefix}{args.model_name}-reasoning_{args.split}.json", "w"), indent=4)
+        print("### Reasoning Stats ###")
+        print(json.dumps(reasoning_stats, indent=4))
+
+    if os.path.exists(descriptive_score_path):
+        descriptive_scores = json.load(open(descriptive_score_path))
+        descriptive_stats = get_descriptive_scores(descriptive_scores, descriptive_meta,
+                                                   reasoning_meta, image_meta)
+        descriptive_stats = get_stats(descriptive_stats)
+
+        json.dump(descriptive_stats, open(f"full_results/{args.stats_prefix}{args.model_name}-descriptive_{args.split}.json", "w"), indent=4)
+        print("### Descriptive Stats ###")
+        print(json.dumps(descriptive_stats, indent=4))
+
+    print("Stats saved to the full_results folder")
+
diff --git a/src/score_utils.py b/src/score_utils.py
new file mode 100644
index 0000000..07aeb9c
--- /dev/null
+++ b/src/score_utils.py
@@ -0,0 +1,161 @@
+DOMAIN2ABBR = {
+    'cs': 'Computer Science',
+    'econ': 'Economics',
+    'eess': 'Electrical Engineering and Systems Science',
+    'math': 'Mathematics',
+    'physics': 'Physics',
+    'q-bio': 'Quantitative Biology',
+    'q-fin': 'Quantitative Finance',
+    'stat': 'Statistics'
+}
+
+NUM2YEAR = {
+    '20': '2020',
+    '21': '2021',
+    '22': '2022',
+    '23': '2023'
+}
+
+def QNUM2QTYPE(qnum):
+    if qnum in [1,2,3,4,5,6,7]:
+        return 'Information Extraction'
+    elif qnum in [8,9,13,14,15]:
+        return 'Enumeration'
+    elif qnum in [11,16,18]:
+        return 'Pattern Recognition'
+    elif qnum in [10,12,19]:
+        return 'Counting'
+    elif qnum in [17]:
+        return 'Compositionality'
+    else:
+        raise ValueError(f"Invalid qnum: {qnum}")
+
+def NUMSUBPLOTS2SUBPLOTTYPE(num_subplots):
+    if num_subplots == 1:
+        return '1 Subplot'
+    elif 2 <= num_subplots <= 4:
+        return '2-4 Subplots'
+    elif num_subplots >= 5:
+        return '5+ Subplots'
+    else:
+        raise ValueError(f"Invalid num_subplots: {num_subplots}")
+
+IDX2ANSTYPE = {
+    1: 'Text-in-Chart',
+    2: 'Text-in-General',
+    3: 'Number-in-Chart',
+    4: 'Number-in-General'
+}
+
+IDX2SRC = {
+    1: 'GPT-Sourced',
+    2: 'GPT-Inspired',
+    3: 'Completely Human'
+}
+
+def D_TEMPLATE():
+    return {
+        'Overall Score': [],
+
+        'By Question': {
+            'Q1': [],
+            'Q2': [],
+            'Q3': [],
+            'Q4': [],
+            'Q5': [],
+            'Q6': [],
+            'Q7': [],
+            'Q8': [],
+            'Q9': [],
+            'Q10': [],
+            'Q11': [],
+            'Q12': [],
+            'Q13': [],
+            'Q14': [],
+            'Q15': [],
+            'Q16': [],
+            'Q17': [],
+            'Q18': [],
+            'Q19': [],
+        },
+
+        'By Category': {
+            'Information Extraction': [],
+            'Enumeration': [],
+            'Pattern Recognition': [],
+            'Counting': [],
+            'Compositionality': [],
+        },
+
+        'By Subplot': {
+            '1 Subplot': [],
+            '2-4 Subplots': [],
+            '5+ Subplots': [],
+        },
+
+        'By Subject': {
+            'Computer Science': [],
+            'Economics': [],
+            'Electrical Engineering and Systems Science': [],
+            'Mathematics': [],
+            'Physics': [],
+            'Quantitative Biology': [],
+            'Quantitative Finance': [],
+            'Statistics': [],
+        },
+
+        'By Year': {
+            '2020': [],
+            '2021': [],
+            '2022': [],
+            '2023': [],
+        },
+
+        'N_valid': [],
+        'N_invalid': []
+    }
+
+def R_TEMPLATE():
+    return {
+        'Overall Score': [],
+
+        'By Answer Type': {
+            'Text-in-Chart': [],
+            'Text-in-General': [],
+            'Number-in-Chart': [],
+            'Number-in-General': [],
+        },
+
+        'By Source': {
+            'GPT-Sourced': [],
+            'GPT-Inspired': [],
+            'Completely Human': [],
+        },
+
+        'By Subject': {
+            'Computer Science': [],
+            'Economics': [],
+            'Electrical Engineering and Systems Science': [],
+            'Mathematics': [],
+            'Physics': [],
+            'Quantitative Biology': [],
+            'Quantitative Finance': [],
+            'Statistics': [],
+        },
+
+        'By Year': {
+            '2020': [],
+            '2021': [],
+            '2022': [],
+            '2023': [],
+        },
+
+        'By Subplot': {
+            '1 Subplot': [],
+            '2-4 Subplots': [],
+            '5+ Subplots': [],
+        },
+
+        'N_valid': [],
+        'N_invalid': []
+}
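
For contributors wiring in an additional open-weight model, the contract that src/generate_lib/utils.py dispatches to can be seen in paligemma.py and phi3.py above: a generate_response(queries, model_path=None) function that iterates over the queries dict and writes queries[k]['response'], plus a matching elif branch in get_generate_fn keyed on the checkpoint basename. A minimal sketch is shown below; the module name newmodel.py, the checkpoint name 'NewModel-7B', and the stub body are placeholders for illustration and are not part of this patch.

    # src/generate_lib/newmodel.py -- hypothetical module, illustration only
    from tqdm import tqdm

    def generate_response(queries, model_path=None):
        # A real integration would load the model from model_path here.
        # This stub only demonstrates the expected input/output shape.
        for k in tqdm(queries):
            query = queries[k]['question']           # question text for chart k
            image_path = queries[k]['figure_path']   # path to the chart image
            queries[k]['response'] = f"stub answer for {k}"  # real code would run the model

    # ...and in get_generate_fn of src/generate_lib/utils.py:
    #     elif model_name in ['NewModel-7B']:   # placeholder checkpoint basename
    #         from .newmodel import generate_response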
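Once GPT-4o-graded score files exist under full_results/ (scores-<model>-descriptive_<split>.json and scores-<model>-reasoning_<split>.json), src/get_stats.py aggregates them with a plain CLI call; the model name and split below are example values, not outputs of this patch:

    python src/get_stats.py --model_name gpt-4o-2024-05-13 --split val

This writes stats-<model>-descriptive_<split>.json and stats-<model>-reasoning_<split>.json next to the score files and prints the per-category breakdowns.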