From 46b10b106999581d21ca57c98b0d5e456042deb7 Mon Sep 17 00:00:00 2001 From: Cycyes <92714336+Cycyes@users.noreply.github.com> Date: Wed, 3 Jan 2024 11:25:42 +0800 Subject: [PATCH] update English version of md (#11336) --- configs/mm_grounding_dino/dataset_prepare.md | 1193 +++++++++++++++++ .../dataset_prepare_zh-CN.md | 75 +- configs/mm_grounding_dino/usage.md | 490 +++++++ tools/dataset_converters/grit_processing.py | 138 ++ 4 files changed, 1894 insertions(+), 2 deletions(-) create mode 100644 configs/mm_grounding_dino/dataset_prepare.md create mode 100644 configs/mm_grounding_dino/usage.md create mode 100644 tools/dataset_converters/grit_processing.py diff --git a/configs/mm_grounding_dino/dataset_prepare.md b/configs/mm_grounding_dino/dataset_prepare.md new file mode 100644 index 00000000000..160c4b98837 --- /dev/null +++ b/configs/mm_grounding_dino/dataset_prepare.md @@ -0,0 +1,1193 @@ +# Data Prepare and Process + +## MM-GDINO-T Pre-train Dataset + +For the MM-GDINO-T model, we provide a total of 5 different data combination pre-training configurations. The data is trained in a progressive accumulation manner, so users can prepare it according to their actual needs. + +### 1 Objects365v1 + +The corresponding training config is [grounding_dino_swin-t_pretrain_obj365](./grounding_dino_swin-t_pretrain_obj365.py) + +Objects365v1 can be downloaded from [opendatalab](https://opendatalab.com/OpenDataLab/Objects365_v1). It offers two methods of download: CLI and SDK. + +After downloading and unzipping, place the dataset or create a symbolic link to the `data/objects365v1` directory. The directory structure is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── objects365v1 +│ │ ├── objects365_train.json +│ │ ├── objects365_val.json +│ │ ├── train +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── val +│ │ │ ├── xxxx.jpg +│ │ │ ├── ... +│ │ ├── test +``` + +Then, use [coco2odvg.py](../../tools/dataset_converters/coco2odvg.py) to convert it into the ODVG format required for training. + +```shell +python tools/dataset_converters/coco2odvg.py data/objects365v1/objects365_train.json -d o365v1 +``` + +After the program runs successfully, it will create two new files, `o365v1_train_od.json` and `o365v1_label_map.json`, in the `data/objects365v1` directory. The complete structure is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── objects365v1 +│ │ ├── objects365_train.json +│ │ ├── objects365_val.json +│ │ ├── o365v1_train_od.json +│ │ ├── o365v1_label_map.json +│ │ ├── train +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── val +│ │ │ ├── xxxx.jpg +│ │ │ ├── ... +│ │ ├── test +``` + +### 2 COCO 2017 + +The above configuration will evaluate the performance on the COCO 2017 dataset during the training process. Therefore, it is necessary to prepare the COCO 2017 dataset. You can download it from the [COCO](https://cocodataset.org/) official website or from [opendatalab](https://opendatalab.com/OpenDataLab/COCO_2017). + +After downloading and unzipping, place the dataset or create a symbolic link to the `data/coco` directory. The directory structure is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_val2017.json +│ │ ├── train2017 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── val2017 +│ │ │ ├── xxxx.jpg +│ │ │ ├── ... 
+``` + +### 3 GoldG + +After downloading the dataset, you can start training with the [grounding_dino_swin-t_pretrain_obj365_goldg](./grounding_dino_swin-t_pretrain_obj365_goldg.py) configuration. + +The GoldG dataset includes the `GQA` and `Flickr30k` datasets, which are part of the MixedGrounding dataset mentioned in the GLIP paper, excluding the COCO dataset. The download links are [mdetr_annotations](https://huggingface.co/GLIPModel/GLIP/tree/main/mdetr_annotations), and the specific files currently needed are `mdetr_annotations/final_mixed_train_no_coco.json` and `mdetr_annotations/final_flickr_separateGT_train.json`. + +Then download the [GQA images](https://nlp.stanford.edu/data/gqa/images.zip). After downloading and unzipping, place the dataset or create a symbolic link to them in the `data/gqa` directory, with the following directory structure: + +```text +mmdetection +├── configs +├── data +│ ├── gqa +| | ├── final_mixed_train_no_coco.json +│ │ ├── images +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +``` + +Then download the [Flickr30k images](http://shannon.cs.illinois.edu/DenotationGraph/). You need to apply for access to this dataset and then download it using the provided link. After downloading and unzipping, place the dataset or create a symbolic link to them in the `data/flickr30k_entities` directory, with the following directory structure: + +```text +mmdetection +├── configs +├── data +│ ├── flickr30k_entities +│ │ ├── final_flickr_separateGT_train.json +│ │ ├── flickr30k_images +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +``` + +For the GQA dataset, you need to use [goldg2odvg.py](../../tools/dataset_converters/goldg2odvg.py) to convert it into the ODVG format required for training: + +```shell +python tools/dataset_converters/goldg2odvg.py data/gqa/final_mixed_train_no_coco.json +``` + +After the program has run, a new file `final_mixed_train_no_coco_vg.json` will be created in the `data/gqa` directory, with the complete structure as follows: + +```text +mmdetection +├── configs +├── data +│ ├── gqa +| | ├── final_mixed_train_no_coco.json +| | ├── final_mixed_train_no_coco_vg.json +│ │ ├── images +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +``` + +For the Flickr30k dataset, you need to use [goldg2odvg.py](../../tools/dataset_converters/goldg2odvg.py) to convert it into the ODVG format required for training: + +```shell +python tools/dataset_converters/goldg2odvg.py data/flickr30k_entities/final_flickr_separateGT_train.json +``` + +After the program has run, a new file `final_flickr_separateGT_train_vg.json` will be created in the `data/flickr30k_entities` directory, with the complete structure as follows: + +```text +mmdetection +├── configs +├── data +│ ├── flickr30k_entities +│ │ ├── final_flickr_separateGT_train.json +│ │ ├── final_flickr_separateGT_train_vg.json +│ │ ├── flickr30k_images +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +``` + +### 4 GRIT-20M + +The corresponding training configuration is [grounding_dino_swin-t_pretrain_obj365_goldg_grit9m](./grounding_dino_swin-t_pretrain_obj365_goldg_grit9m.py). + +The GRIT dataset can be downloaded using the img2dataset package from [GRIT](https://huggingface.co/datasets/zzliang/GRIT#download-image). By default, the dataset size is 1.1T, and downloading and processing it may require at least 2T of disk space, depending on your available storage capacity. 
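
For reference, the [GRIT dataset card](https://huggingface.co/datasets/zzliang/GRIT#download-image) describes downloading the parquet annotation files first and then fetching the images with img2dataset. The invocation below is only a sketch adapted from that card: the metadata directory, output folder, worker counts, and resize settings are assumptions you should adapt, and the exact flags should be checked against the img2dataset documentation.

```shell
# Sketch only: assumes the GRIT parquet metadata has already been downloaded to data/grit_meta.
# Images, per-shard parquet files, and stats are written to data/grit_raw in webdataset format.
img2dataset --url_list data/grit_meta --input_format "parquet" \
    --url_col "url" --caption_col "caption" --output_format webdataset \
    --output_folder data/grit_raw --processes_count 4 --thread_count 64 \
    --resize_mode "no" \
    --save_additional_columns '["id","noun_chunks","ref_exps","clip_similarity_vitb32","clip_similarity_vitl14"]' \
    --enable_wandb False
```
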
After downloading, the dataset is in its original format, which includes: + +```text +mmdetection +├── configs +├── data +│ ├── grit_raw +│ │ ├── 00000_stats.json +│ │ ├── 00000.parquet +│ │ ├── 00000.tar +│ │ ├── 00001_stats.json +│ │ ├── 00001.parquet +│ │ ├── 00001.tar +│ │ ├── ... +``` + +After downloading, further format processing is required: + +```shell +python tools/dataset_converters/grit_processing.py data/grit_raw data/grit_processed +``` + +The processed format is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── grit_processed +│ │ ├── annotations +│ │ │ ├── 00000.json +│ │ │ ├── 00001.json +│ │ │ ├── ... +│ │ ├── images +│ │ │ ├── 00000 +│ │ │ │ ├── 000000000.jpg +│ │ │ │ ├── 000000003.jpg +│ │ │ │ ├── 000000004.jpg +│ │ │ │ ├── ... +│ │ │ ├── 00001 +│ │ │ ├── ... +``` + +As for the GRIT dataset, you need to use [grit2odvg.py](../../tools/dataset_converters/grit2odvg.py) to convert it to the format of ODVG: + +```python +python tools/dataset_converters/grit2odvg.py data/grit_processed/ +``` + +After the program has run, a new file `grit20m_vg.json` will be created in the `data/grit_processed` directory, which has about 9M data, with the complete structure as follows: + +```text +mmdetection +├── configs +├── data +│ ├── grit_processed +| | ├── grit20m_vg.json +│ │ ├── annotations +│ │ │ ├── 00000.json +│ │ │ ├── 00001.json +│ │ │ ├── ... +│ │ ├── images +│ │ │ ├── 00000 +│ │ │ │ ├── 000000000.jpg +│ │ │ │ ├── 000000003.jpg +│ │ │ │ ├── 000000004.jpg +│ │ │ │ ├── ... +│ │ │ ├── 00001 +│ │ │ ├── ... +``` + +### 5 V3Det + +The corresponding training configurations are: + +- [grounding_dino_swin-t_pretrain_obj365_goldg_v3det](./grounding_dino_swin-t_pretrain_obj365_goldg_v3det.py) +- [grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det](./grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det.py) + +The V3Det dataset can be downloaded from [opendatalab](https://opendatalab.com/V3Det/V3Det). After downloading and unzipping, place the dataset or create a symbolic link to it in the `data/v3det` directory, with the following directory structure: + +```text +mmdetection +├── configs +├── data +│ ├── v3det +│ │ ├── annotations +│ │ | ├── v3det_2023_v1_train.json +│ │ ├── images +│ │ │ ├── a00000066 +│ │ │ │ ├── xxx.jpg +│ │ │ ├── ... +``` + +Then use [coco2odvg.py](../../tools/dataset_converters/coco2odvg.py) to convert it into the ODVG format required for training: + +```shell +python tools/dataset_converters/coco2odvg.py data/v3det/annotations/v3det_2023_v1_train.json -d v3det +``` + +After the program has run, two new files `v3det_2023_v1_train_od.json` and `v3det_2023_v1_label_map.json` will be created in the `data/v3det/annotations` directory, with the complete structure as follows: + +```text +mmdetection +├── configs +├── data +│ ├── v3det +│ │ ├── annotations +│ │ | ├── v3det_2023_v1_train.json +│ │ | ├── v3det_2023_v1_train_od.json +│ │ | ├── v3det_2023_v1_label_map.json +│ │ ├── images +│ │ │ ├── a00000066 +│ │ │ │ ├── xxx.jpg +│ │ │ ├── ... +``` + +### 6 Data Splitting and Visualization + +Considering that users need to prepare many datasets, which is inconvenient for confirming images and annotations before training, we provide a data splitting and visualization tool. This tool can split the dataset into a tiny version and then use a visualization script to check the correctness of the images and labels. + +1. Splitting the Dataset + +The script is located [here](../../tools/misc/split_odvg.py). 
Taking `Object365 v1` as an example, the command to split the dataset is as follows: + +```shell +python tools/misc/split_odvg.py data/object365_v1/ o365v1_train_od.json train your_output_dir --label-map-file o365v1_label_map.json -n 200 +``` + +After running the above script, it will create a folder structure in the `your_output_dir` directory identical to `data/object365_v1/`, but it will only save 200 training images and their corresponding json files for convenient user review. + +2. Visualizing the Original Dataset + +The script is located [here](../../tools/analysis_tools/browse_grounding_raw.py). Taking `Object365 v1` as an example, the command to visualize the dataset is as follows: + +```shell +python tools/analysis_tools/browse_grounding_raw.py data/object365_v1/ o365v1_train_od.json train --label-map-file o365v1_label_map.json -o your_output_dir --not-show +``` + +After running the above script, it will generate images in the `your_output_dir` directory that include both the pictures and their labels, making it convenient for users to review. + +3. Visualizing the Output Dataset + +The script is located [here](../../tools/analysis_tools/browse_grounding_dataset.py). Users can use this script to view the results of the dataset output, including the results of data augmentation. Taking `Object365 v1` as an example, the command to visualize the dataset is as follows: + +```shell +python tools/analysis_tools/browse_grounding_dataset.py configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365.py -o your_output_dir --not-show +``` + +After running the above script, it will generate images in the `your_output_dir` directory that include both the pictures and their labels, making it convenient for users to review. + +## MM-GDINO-L Pre-training Data Preparation and Processing + +### 1 Object365 v2 + +Objects365_v2 can be downloaded from [opendatalab](https://opendatalab.com/OpenDataLab/Objects365). It offers two download methods: CLI and SDK. + +After downloading and unzipping, place the dataset or create a symbolic link to it in the `data/objects365v2` directory, with the following directory structure: + +```text +mmdetection +├── configs +├── data +│ ├── objects365v2 +│ │ ├── annotations +│ │ │ ├── zhiyuan_objv2_train.json +│ │ ├── train +│ │ │ ├── patch0 +│ │ │ │ ├── xxx.jpg +│ │ │ ├── ... +``` + +Since some category names in Objects365v2 are incorrect, it is necessary to correct them first. + +```shell +python tools/dataset_converters/fix_o365_names.py +``` + +A new annotation file `zhiyuan_objv2_train_fixname.json` will be generated in the `data/objects365v2/annotations` directory. + +Then use [coco2odvg.py](../../tools/dataset_converters/coco2odvg.py) to convert it into the ODVG format required for training: + +```shell +python tools/dataset_converters/coco2odvg.py data/objects365v2/annotations/zhiyuan_objv2_train_fixname.json -d o365v2 +``` + +After the program has run, two new files `zhiyuan_objv2_train_fixname_od.json` and `o365v2_label_map.json` will be created in the `data/objects365v2` directory, with the complete structure as follows: + +```text +mmdetection +├── configs +├── data +│ ├── objects365v2 +│ │ ├── annotations +│ │ │ ├── zhiyuan_objv2_train.json +│ │ │ ├── zhiyuan_objv2_train_fixname.json +│ │ │ ├── zhiyuan_objv2_train_fixname_od.json +│ │ │ ├── o365v2_label_map.json +│ │ ├── train +│ │ │ ├── patch0 +│ │ │ │ ├── xxx.jpg +│ │ │ ├── ... 
+``` + +### 2 OpenImages v6 + +OpenImages v6 can be downloaded from the [official website](https://storage.googleapis.com/openimages/web/download_v6.html). Due to the large size of the dataset, it may take some time to download. After completion, the file structure is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── OpenImages +│ │ ├── annotations +| │ │ ├── oidv6-train-annotations-bbox.csv +| │ │ ├── class-descriptions-boxable.csv +│ │ ├── OpenImages +│ │ │ ├── train +│ │ │ │ ├── xxx.jpg +│ │ │ ├── ... +``` + +Then use [openimages2odvg.py](../../tools/dataset_converters/openimages2odvg.py) to convert it into the ODVG format required for training: + +```shell +python tools/dataset_converters/openimages2odvg.py data/OpenImages/annotations +``` + +After the program has run, two new files `oidv6-train-annotation_od.json` and `openimages_label_map.json` will be created in the `data/OpenImages/annotations` directory, with the complete structure as follows: + +```text +mmdetection +├── configs +├── data +│ ├── OpenImages +│ │ ├── annotations +| │ │ ├── oidv6-train-annotations-bbox.csv +| │ │ ├── class-descriptions-boxable.csv +| │ │ ├── oidv6-train-annotations_od.json +| │ │ ├── openimages_label_map.json +│ │ ├── OpenImages +│ │ │ ├── train +│ │ │ │ ├── xxx.jpg +│ │ │ ├── ... +``` + +### 3 V3Det + +Referring to the data preparation section of the previously mentioned MM-GDINO-T pre-training data preparation and processing, the complete dataset structure is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── v3det +│ │ ├── annotations +│ │ | ├── v3det_2023_v1_train.json +│ │ | ├── v3det_2023_v1_train_od.json +│ │ | ├── v3det_2023_v1_label_map.json +│ │ ├── images +│ │ │ ├── a00000066 +│ │ │ │ ├── xxx.jpg +│ │ │ ├── ... +``` + +### 4 LVIS 1.0 + +Please refer to the `2 LVIS 1.0` section of the later `Fine-tuning Dataset Preparation`. The complete dataset structure is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── instances_train2017.json +│ │ │ ├── lvis_v1_train.json +│ │ │ ├── lvis_v1_val.json +│ │ │ ├── lvis_v1_train_od.json +│ │ │ ├── lvis_v1_label_map.json +│ │ │ ├── instances_val2017.json +│ │ │ ├── lvis_v1_minival_inserted_image_name.json +│ │ │ ├── lvis_od_val.json +│ │ ├── train2017 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── val2017 +│ │ │ ├── xxxx.jpg +│ │ │ ├── ... +``` + +### 5 COCO2017 OD + +You can refer to the earlier section `MM-GDINO-T Pre-training Data Preparation and Processing` for data preparation. For convenience in subsequent processing, please create a symbolic link or move the downloaded [mdetr_annotations](https://huggingface.co/GLIPModel/GLIP/tree/main/mdetr_annotations) folder to the `data/coco` path. The complete dataset structure is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_val2017.json +│ │ ├── mdetr_annotations +│ │ │ ├── final_refexp_val.json +│ │ │ ├── finetune_refcoco_testA.json +│ │ │ ├── ... +│ │ ├── train2017 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── val2017 +│ │ │ ├── xxxx.jpg +│ │ │ ├── ... +``` + +Due to some overlap between COCO2017 train and RefCOCO/RefCOCO+/RefCOCOg/gRefCOCO val, if not removed in advance, there will be data leakage when evaluating RefExp. 
+ +```shell +python tools/dataset_converters/remove_cocotrain2017_from_refcoco.py data/coco/mdetr_annotations data/coco/annotations/instances_train2017.json +``` + +A new file `instances_train2017_norefval.json` will be created in the `data/coco/annotations` directory. Finally, use [coco2odvg.py](../../tools/dataset_converters/coco2odvg.py) to convert it into the ODVG format required for training: + +```shell +python tools/dataset_converters/coco2odvg.py data/coco/annotations/instances_train2017_norefval.json -d coco +``` + +Two new files `instances_train2017_norefval_od.json` and `coco_label_map.json` will be created in the `data/coco/annotations` directory, with the complete structure as follows: + +```text +mmdetection +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_val2017.json +│ │ │ ├── instances_train2017_norefval_od.json +│ │ │ ├── coco_label_map.json +│ │ ├── mdetr_annotations +│ │ │ ├── final_refexp_val.json +│ │ │ ├── finetune_refcoco_testA.json +│ │ │ ├── ... +│ │ ├── train2017 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── val2017 +│ │ │ ├── xxxx.jpg +│ │ │ ├── ... +``` + +Note: There are 15,000 images that overlap between the COCO2017 train and LVIS 1.0 val datasets. Therefore, if the COCO2017 train dataset is used in training, the evaluation results of LVIS 1.0 val will have a data leakage issue. However, LVIS 1.0 minival does not have this problem. + +### 6 GoldG + +Please refer to the section on `MM-GDINO-T Pre-training Data Preparation and Processing`. + +```text +mmdetection +├── configs +├── data +│ ├── flickr30k_entities +│ │ ├── final_flickr_separateGT_train.json +│ │ ├── final_flickr_separateGT_train_vg.json +│ │ ├── flickr30k_images +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ ├── gqa +| | ├── final_mixed_train_no_coco.json +| | ├── final_mixed_train_no_coco_vg.json +│ │ ├── images +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +``` + +### 7 COCO2014 VG + +MDetr provides a Phrase Grounding version of the COCO2014 train annotations. The original annotation file is named `final_mixed_train.json`, and similar to the previous structure, the file structure is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_val2017.json +│ │ ├── mdetr_annotations +│ │ │ ├── final_mixed_train.json +│ │ │ ├── ... +│ │ ├── train2017 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── train2014 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +``` + +We can extract the COCO portion of the data from `final_mixed_train.json`. + +```shell +python tools/dataset_converters/extract_coco_from_mixed.py data/coco/mdetr_annotations/final_mixed_train.json +``` + +A new file named `final_mixed_train_only_coco.json` will be created in the `data/coco/mdetr_annotations` directory. 
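
Before converting, you can optionally sanity-check the extracted file. It should keep the standard COCO-style keys, so a quick count of images and annotations is enough to confirm the extraction worked (a hypothetical one-liner; adjust the path if yours differs):

```shell
python -c "import json; d = json.load(open('data/coco/mdetr_annotations/final_mixed_train_only_coco.json')); print(len(d['images']), len(d['annotations']))"
```
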
Finally, use [goldg2odvg.py](../../tools/dataset_converters/goldg2odvg.py) to convert it into the ODVG format required for training: + +```shell +python tools/dataset_converters/goldg2odvg.py data/coco/mdetr_annotations/final_mixed_train_only_coco.json +``` + +A new file named `final_mixed_train_only_coco_vg.json` will be created in the `data/coco/mdetr_annotations` directory, with the complete structure as follows: + +```text +mmdetection +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_val2017.json +│ │ ├── mdetr_annotations +│ │ │ ├── final_mixed_train.json +│ │ │ ├── final_mixed_train_only_coco.json +│ │ │ ├── final_mixed_train_only_coco_vg.json +│ │ │ ├── ... +│ │ ├── train2017 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── train2014 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +``` + +Note: COCO2014 train and COCO2017 val do not have duplicate images, so there is no need to worry about data leakage issues in COCO evaluation. + +### 8 Referring Expression Comprehension + +There are a total of 4 datasets included. For data preparation, please refer to the `Fine-tuning Dataset Preparation` section. + +```text +mmdetection +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_val2017.json +│ │ │ ├── instances_train2014.json +│ │ ├── train2017 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── val2017 +│ │ │ ├── xxxx.jpg +│ │ │ ├── ... +│ │ ├── train2014 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── mdetr_annotations +│ │ │ ├── final_refexp_val.json +│ │ │ ├── finetune_refcoco_testA.json +│ │ │ ├── finetune_refcoco_testB.json +│ │ │ ├── finetune_refcoco+_testA.json +│ │ │ ├── finetune_refcoco+_testB.json +│ │ │ ├── finetune_refcocog_test.json +│ │ │ ├── finetune_refcoco_train_vg.json +│ │ │ ├── finetune_refcoco+_train_vg.json +│ │ │ ├── finetune_refcocog_train_vg.json +│ │ │ ├── finetune_grefcoco_train_vg.json +``` + +### 9 GRIT-20M + +Please refer to the `MM-GDINO-T Pre-training Data Preparation and Processing` section. + +## Preparation of Evaluation Dataset + +### 1 COCO 2017 + +The data preparation process is consistent with the previous descriptions, and the final structure is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_val2017.json +│ │ ├── train2017 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── val2017 +│ │ │ ├── xxxx.jpg +│ │ │ ├── ... +``` + +### 2 LVIS 1.0 + +The LVIS 1.0 val dataset includes both mini and full versions. The significance of the mini version is: + +1. The full LVIS val evaluation dataset is quite large, and conducting an evaluation with it can take a significant amount of time. +2. In the full LVIS val dataset, there are 15,000 images from the COCO2017 train dataset. If a user has used the COCO2017 data for training, there can be a data leakage issue when evaluating on the full LVIS val dataset + +The LVIS 1.0 dataset contains images that are exactly the same as the COCO2017 dataset, with the addition of new annotations. You can download the minival annotation file from [here](https://huggingface.co/GLIPModel/GLIP/blob/main/lvis_v1_minival_inserted_image_name.json), and the val 1.0 annotation file from [here](https://huggingface.co/GLIPModel/GLIP/blob/main/lvis_od_val.json). 
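
For example, both files can be fetched directly with `wget`. Note that the `blob` segment of the Hugging Face links above is replaced with `resolve` to obtain the raw files; the target directory below simply follows the layout used in this document:

```shell
cd data/coco/annotations
wget https://huggingface.co/GLIPModel/GLIP/resolve/main/lvis_v1_minival_inserted_image_name.json
wget https://huggingface.co/GLIPModel/GLIP/resolve/main/lvis_od_val.json
```
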
The final structure is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_val2017.json +│ │ │ ├── lvis_v1_minival_inserted_image_name.json +│ │ │ ├── lvis_od_val.json +│ │ ├── train2017 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── val2017 +│ │ │ ├── xxxx.jpg +│ │ │ ├── ... +``` + +### 3 ODinW + +ODinW, which stands for Object Detection in the Wild, is a dataset used to evaluate the generalization capability of grounding pre-trained models in different real-world scenarios. It consists of two subsets, ODinW13 and ODinW35, representing datasets composed of 13 and 35 different datasets, respectively. You can download it from [here](https://huggingface.co/GLIPModel/GLIP/tree/main/odinw_35), and then unzip each file. The final structure is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── odinw +│ │ ├── AerialMaritimeDrone +│ │ | |── large +│ │ | | ├── test +│ │ | | ├── train +│ │ | | ├── valid +│ │ | |── tiled +│ │ ├── AmericanSignLanguageLetters +│ │ ├── Aquarium +│ │ ├── BCCD +│ │ ├── ... +``` + +When evaluating ODinW35, custom prompts are required. Therefore, it's necessary to preprocess the annotated JSON files in advance. You can use the [override_category.py](./odinw/override_category.py) script for this purpose. After processing, it will generate new annotation files without overwriting the original ones. + +```shell +python configs/mm_grounding_dino/odinw/override_category.py data/odinw/ +``` + +### 4 DOD + +DOD stands for Described Object Detection, and it is introduced in the paper titled [Described Object Detection: Liberating Object Detection with Flexible Expressions](https://arxiv.org/abs/2307.12813). You can download the dataset from [here](https://github.com/shikras/d-cube?tab=readme-ov-file). The final structure of the dataset is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── d3 +│ │ ├── d3_images +│ │ ├── d3_json +│ │ ├── d3_pkl +``` + +### 5 Flickr30k Entities + +In the previous GoldG data preparation section, we downloaded the necessary files for training with Flickr30k. For evaluation, you will need 2 JSON files, which you can download from [here](https://huggingface.co/GLIPModel/GLIP/blob/main/mdetr_annotations/final_flickr_separateGT_val.json) and [here](https://huggingface.co/GLIPModel/GLIP/blob/main/mdetr_annotations/final_flickr_separateGT_test.json). The final structure of the dataset is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── flickr30k_entities +│ │ ├── final_flickr_separateGT_train.json +│ │ ├── final_flickr_separateGT_val.json +│ │ ├── final_flickr_separateGT_test.json +│ │ ├── final_flickr_separateGT_train_vg.json +│ │ ├── flickr30k_images +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +``` + +### 6 Referring Expression Comprehension + +Referential Expression Comprehension includes 4 datasets: RefCOCO, RefCOCO+, RefCOCOg, and gRefCOCO. The images used in these 4 datasets are from COCO2014 train, similar to COCO2017. You can download the images from the official COCO website or opendatalab. The annotations can be directly downloaded from [here](https://huggingface.co/GLIPModel/GLIP/tree/main/mdetr_annotations). The mdetr_annotations folder contains a large number of annotations, so you can choose to download only the JSON files you need. 
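
For instance, with the `huggingface_hub` CLI you can pull only selected files. The sketch below is an assumption: extend the file list to whichever splits you plan to evaluate, and verify the command against your installed `huggingface_hub` version.

```shell
# Requires: pip install -U huggingface_hub
# Files land under data/coco/mdetr_annotations/ because --local-dir keeps repo-relative paths.
huggingface-cli download GLIPModel/GLIP \
    mdetr_annotations/final_refexp_val.json \
    mdetr_annotations/finetune_refcoco_testA.json \
    mdetr_annotations/finetune_refcoco_testB.json \
    --local-dir data/coco/
```
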
The final structure of the dataset is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_val2017.json +│ │ │ ├── instances_train2014.json +│ │ ├── train2017 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── val2017 +│ │ │ ├── xxxx.jpg +│ │ │ ├── ... +│ │ ├── train2014 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── mdetr_annotations +│ │ │ ├── final_refexp_val.json +│ │ │ ├── finetune_refcoco_testA.json +│ │ │ ├── finetune_refcoco_testB.json +│ │ │ ├── finetune_refcoco+_testA.json +│ │ │ ├── finetune_refcoco+_testB.json +│ │ │ ├── finetune_refcocog_test.json +│ │ │ ├── finetune_refcocog_test.json +``` + +Please note that gRefCOCO is introduced in [GREC: Generalized Referring Expression Comprehension](https://arxiv.org/abs/2308.16182) and is not available in the `mdetr_annotations` folder. You will need to handle it separately. Here are the specific steps: + +1. Download [gRefCOCO](https://github.com/henghuiding/gRefCOCO?tab=readme-ov-file) and unzip it into the `data/coco/` folder. + +```text +mmdetection +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_val2017.json +│ │ │ ├── instances_train2014.json +│ │ ├── train2017 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── val2017 +│ │ │ ├── xxxx.jpg +│ │ │ ├── ... +│ │ ├── train2014 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── mdetr_annotations +│ │ ├── grefs +│ │ │ ├── grefs(unc).json +│ │ │ ├── instances.json +``` + +2. Convert to COCO format + +You can use the official [conversion script](https://github.com/henghuiding/gRefCOCO/blob/b4b1e55b4d3a41df26d6b7d843ea011d581127d4/mdetr/scripts/fine-tuning/grefexp_coco_format.py) provided by gRefCOCO. Please note that you need to uncomment line 161 and comment out line 160 in the script to obtain the full JSON file. + +```shell +# you need to clone the official repo +git clone https://github.com/henghuiding/gRefCOCO.git +cd gRefCOCO/mdetr +python scripts/fine-tuning/grefexp_coco_format.py --data_path ../../data/coco/grefs --out_path ../../data/coco/mdetr_annotations/ --coco_path ../../data/coco +``` + +Four JSON files will be generated in the `data/coco/mdetr_annotations/` folder. The complete dataset structure is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_val2017.json +│ │ │ ├── instances_train2014.json +│ │ ├── train2017 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── val2017 +│ │ │ ├── xxxx.jpg +│ │ │ ├── ... +│ │ ├── train2014 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── mdetr_annotations +│ │ │ ├── final_refexp_val.json +│ │ │ ├── finetune_refcoco_testA.json +│ │ │ ├── finetune_refcoco_testB.json +│ │ │ ├── finetune_grefcoco_train.json +│ │ │ ├── finetune_grefcoco_val.json +│ │ │ ├── finetune_grefcoco_testA.json +│ │ │ ├── finetune_grefcoco_testB.json +``` + +## Fine-Tuning Dataset Preparation + +### 1 COCO 2017 + +COCO is the most commonly used dataset in the field of object detection, and we aim to explore its fine-tuning modes more comprehensively. From current developments, there are a total of three fine-tuning modes: + +1. Closed-set fine-tuning, where the description on the text side cannot be modified after fine-tuning, transforms into a closed-set algorithm. This approach maximizes performance on COCO but loses generality. +2. Open-set continued pretraining fine-tuning involves using pretraining methods consistent with the COCO dataset. 
There are two approaches to this: the first is to reduce the learning rate and fix certain modules, fine-tuning only on the COCO dataset; the second is to mix COCO data with some of the pre-trained data. The goal of both approaches is to improve performance on the COCO dataset as much as possible without compromising generalization. +3. Open-vocabulary fine-tuning involves adopting a common practice in the OVD (Open-Vocabulary Detection) domain. It divides COCO categories into base classes and novel classes. During training, fine-tuning is performed only on the base classes, while evaluation is conducted on both base and novel classes. This approach allows for the assessment of COCO OVD capabilities, with the goal of improving COCO dataset performance without compromising generalization as much as possible. + +\*\*(1) Closed-set Fine-tuning \*\* + +This section does not require data preparation; you can directly use the data you have prepared previously. + +```text +mmdetection +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_val2017.json +│ │ ├── train2017 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── val2017 +│ │ │ ├── xxxx.jpg +│ │ │ ├── ... +``` + +**(2) Open-set Continued Pretraining Fine-tuning** +To use this approach, you need to convert the COCO training data into ODVG format. You can use the following command for conversion: + +```shell +python tools/dataset_converters/coco2odvg.py data/coco/annotations/instances_train2017.json -d coco +``` + +This will generate new files, `instances_train2017_od.json` and `coco2017_label_map.json`, in the `data/coco/annotations/` directory. The complete dataset structure is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_train2017_od.json +│ │ │ ├── coco2017_label_map.json +│ │ │ ├── instances_val2017.json +│ │ ├── train2017 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── val2017 +│ │ │ ├── xxxx.jpg +│ │ │ ├── ... +``` + +Once you have obtained the data, you can choose whether to perform individual pretraining or mixed pretraining. + +**(3) Open-vocabulary Fine-tuning** +For this approach, you need to convert the COCO training data into OVD (Open-Vocabulary Detection) format. You can use the following command for conversion: + +```shell +python tools/dataset_converters/coco2ovd.py data/coco/ +``` + +This will generate new files, `instances_val2017_all_2.json` and `instances_val2017_seen_2.json`, in the `data/coco/annotations/` directory. The complete dataset structure is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_train2017_od.json +│ │ │ ├── instances_val2017_all_2.json +│ │ │ ├── instances_val2017_seen_2.json +│ │ │ ├── coco2017_label_map.json +│ │ │ ├── instances_val2017.json +│ │ ├── train2017 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── val2017 +│ │ │ ├── xxxx.jpg +│ │ │ ├── ... +``` + +You can then proceed to train and test directly using the [configuration](coco/grounding_dino_swin-t_finetune_16xb4_1x_coco_48_17.py). + +### 2 LVIS 1.0 + +LVIS is a dataset that includes 1,203 classes, making it a valuable dataset for fine-tuning. Due to its large number of classes, it's not feasible to perform closed-set fine-tuning. Therefore, we can only use open-set continued pretraining fine-tuning and open-vocabulary fine-tuning on LVIS. 

You need to prepare the LVIS training JSON files first, which you can download from [here](https://www.lvisdataset.org/dataset). We only need `lvis_v1_train.json` and `lvis_v1_val.json`. After downloading, place them in the `data/coco/annotations/` directory. The directory structure is then as follows:

```text
mmdetection
├── configs
├── data
│ ├── coco
│ │ ├── annotations
│ │ │ ├── instances_train2017.json
│ │ │ ├── lvis_v1_train.json
│ │ │ ├── lvis_v1_val.json
│ │ │ ├── instances_val2017.json
│ │ │ ├── lvis_v1_minival_inserted_image_name.json
│ │ │ ├── lvis_od_val.json
│ │ ├── train2017
│ │ │ ├── xxx.jpg
│ │ │ ├── ...
│ │ ├── val2017
│ │ │ ├── xxxx.jpg
│ │ │ ├── ...
```

**(1) Open-set Continued Pretraining Fine-tuning**

Convert the annotations to ODVG format using the following command:

```shell
python tools/dataset_converters/lvis2odvg.py data/coco/annotations/lvis_v1_train.json
```

It will generate new files, `lvis_v1_train_od.json` and `lvis_v1_label_map.json`, in the `data/coco/annotations/` directory, and the complete dataset structure will look like this:

```text
mmdetection
├── configs
├── data
│ ├── coco
│ │ ├── annotations
│ │ │ ├── instances_train2017.json
│ │ │ ├── lvis_v1_train.json
│ │ │ ├── lvis_v1_val.json
│ │ │ ├── lvis_v1_train_od.json
│ │ │ ├── lvis_v1_label_map.json
│ │ │ ├── instances_val2017.json
│ │ │ ├── lvis_v1_minival_inserted_image_name.json
│ │ │ ├── lvis_od_val.json
│ │ ├── train2017
│ │ │ ├── xxx.jpg
│ │ │ ├── ...
│ │ ├── val2017
│ │ │ ├── xxxx.jpg
│ │ │ ├── ...
```

You can directly use the provided [configuration](lvis/grounding_dino_swin-t_finetune_16xb4_1x_lvis.py) for training and testing, or you can modify the configuration to mix in some of the pretraining datasets as needed.

**(2) Open-vocabulary Fine-tuning**

Convert the annotations to OVD format using the following command:

```shell
python tools/dataset_converters/lvis2ovd.py data/coco/
```

New files `lvis_v1_train_od_norare.json` and `lvis_v1_label_map_norare.json` will be generated under `data/coco/annotations/`, and the complete dataset structure is as follows:

```text
mmdetection
├── configs
├── data
│ ├── coco
│ │ ├── annotations
│ │ │ ├── instances_train2017.json
│ │ │ ├── lvis_v1_train.json
│ │ │ ├── lvis_v1_val.json
│ │ │ ├── lvis_v1_train_od.json
│ │ │ ├── lvis_v1_label_map.json
│ │ │ ├── instances_val2017.json
│ │ │ ├── lvis_v1_minival_inserted_image_name.json
│ │ │ ├── lvis_od_val.json
│ │ │ ├── lvis_v1_train_od_norare.json
│ │ │ ├── lvis_v1_label_map_norare.json
│ │ ├── train2017
│ │ │ ├── xxx.jpg
│ │ │ ├── ...
│ │ ├── val2017
│ │ │ ├── xxxx.jpg
│ │ │ ├── ...
```

Then you can directly use the [configuration](lvis/grounding_dino_swin-t_finetune_16xb4_1x_lvis_866_337.py) for training and testing.

### 3 RTTS

RTTS is a foggy-weather dataset containing 4,322 foggy images in five classes: bicycle, bus, car, motorbike, and person. It can be downloaded from [here](https://drive.google.com/file/d/15Ei1cHGVqR1mXFep43BO7nkHq1IEGh1e/view) and then extracted to the `data/RTTS/` folder. The complete dataset structure is as follows:

```text
mmdetection
├── configs
├── data
│ ├── RTTS
│ │ ├── annotations_json
│ │ ├── annotations_xml
│ │ ├── ImageSets
│ │ ├── JPEGImages
```

### 4 RUOD

RUOD is an underwater object detection dataset. You can download it from [here](https://drive.google.com/file/d/1hxtbdgfVveUm_DJk5QXkNLokSCTa_E5o/view) and then extract it to the `data/RUOD/` folder.
The complete dataset structure is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── RUOD +│ │ ├── Environment_pic +│ │ ├── Environmet_ANN +│ │ ├── RUOD_ANN +│ │ ├── RUOD_pic +``` + +### 5 Brain Tumor + +Brain Tumor is a 2D detection dataset in the medical field. You can download it from [here](https://universe.roboflow.com/roboflow-100/brain-tumor-m2pbp/dataset/2), please make sure to choose the `COCO JSON` format. Then extract it to the `data/brain_tumor_v2/` folder. The complete dataset structure is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── brain_tumor_v2 +│ │ ├── test +│ │ ├── train +│ │ ├── valid +``` + +### 6 Cityscapes + +Cityscapes is an urban street scene dataset. You can download it from [here](https://www.cityscapes-dataset.com/) or from opendatalab, and then extract it to the `data/cityscapes/` folder. The complete dataset structure is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── cityscapes +│ │ ├── annotations +│ │ ├── leftImg8bit +│ │ │ ├── train +│ │ │ ├── val +│ │ ├── gtFine +│ │ │ ├── train +│ │ │ ├── val +``` + +After downloading, you can use the [cityscapes.py](../../tools/dataset_converters/cityscapes.py) script to generate the required JSON format. + +```shell +python tools/dataset_converters/cityscapes.py data/cityscapes/ +``` + +Three new JSON files will be generated in the annotations directory. The complete dataset structure is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── cityscapes +│ │ ├── annotations +│ │ │ ├── instancesonly_filtered_gtFine_train.json +│ │ │ ├── instancesonly_filtered_gtFine_val.json +│ │ │ ├── instancesonly_filtered_gtFine_test.json +│ │ ├── leftImg8bit +│ │ │ ├── train +│ │ │ ├── val +│ │ ├── gtFine +│ │ │ ├── train +│ │ │ ├── val +``` + +### 7 People in Painting + +People in Painting is an oil painting dataset that you can download from [here](https://universe.roboflow.com/roboflow-100/people-in-paintings/dataset/2). Please make sure to choose the `COCO JSON` format. After downloading, unzip the dataset to the `data/people_in_painting_v2/` folder. The complete dataset structure is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── people_in_painting_v2 +│ │ ├── test +│ │ ├── train +│ │ ├── valid +``` + +### 8 Referring Expression Comprehension + +Fine-tuning for Referential Expression Comprehension is similar to what was described earlier and includes four datasets. The dataset preparation for evaluation has already been organized. The complete dataset structure is as follows: + +```text +mmdetection +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_val2017.json +│ │ │ ├── instances_train2014.json +│ │ ├── train2017 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── val2017 +│ │ │ ├── xxxx.jpg +│ │ │ ├── ... +│ │ ├── train2014 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── mdetr_annotations +│ │ │ ├── final_refexp_val.json +│ │ │ ├── finetune_refcoco_testA.json +│ │ │ ├── finetune_refcoco_testB.json +│ │ │ ├── finetune_refcoco+_testA.json +│ │ │ ├── finetune_refcoco+_testB.json +│ │ │ ├── finetune_refcocog_test.json +│ │ │ ├── finetune_refcocog_test.json +``` + +Then we need to convert it to the required ODVG format. Please use the [refcoco2odvg.py](../../tools/dataset_converters/refcoco2odvg.py) script to perform the conversion. 
+ +```shell +python tools/dataset_converters/refcoco2odvg.py data/coco/mdetr_annotations +``` + +The converted dataset structure will include 4 new JSON files in the `data/coco/mdetr_annotations` directory. Here is the structure of the converted dataset: + +```text +mmdetection +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_val2017.json +│ │ │ ├── instances_train2014.json +│ │ ├── train2017 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── val2017 +│ │ │ ├── xxxx.jpg +│ │ │ ├── ... +│ │ ├── train2014 +│ │ │ ├── xxx.jpg +│ │ │ ├── ... +│ │ ├── mdetr_annotations +│ │ │ ├── final_refexp_val.json +│ │ │ ├── finetune_refcoco_testA.json +│ │ │ ├── finetune_refcoco_testB.json +│ │ │ ├── finetune_refcoco+_testA.json +│ │ │ ├── finetune_refcoco+_testB.json +│ │ │ ├── finetune_refcocog_test.json +│ │ │ ├── finetune_refcoco_train_vg.json +│ │ │ ├── finetune_refcoco+_train_vg.json +│ │ │ ├── finetune_refcocog_train_vg.json +│ │ │ ├── finetune_grefcoco_train_vg.json +``` diff --git a/configs/mm_grounding_dino/dataset_prepare_zh-CN.md b/configs/mm_grounding_dino/dataset_prepare_zh-CN.md index 0af692e4ceb..31647e91c5d 100644 --- a/configs/mm_grounding_dino/dataset_prepare_zh-CN.md +++ b/configs/mm_grounding_dino/dataset_prepare_zh-CN.md @@ -4,7 +4,7 @@ MM-GDINO-T 模型中我们一共提供了 5 种不同数据组合的预训练配置,数据采用逐步累加的方式进行训练,因此用户可以根据自己的实际需求准备数据。 -### 1 Object365 v1 +### 1 Objects365 v1 对应的训练配置为 [grounding_dino_swin-t_pretrain_obj365](./grounding_dino_swin-t_pretrain_obj365.py) @@ -152,6 +152,77 @@ mmdetection 对应的训练配置为 [grounding_dino_swin-t_pretrain_obj365_goldg_grit9m](./grounding_dino_swin-t_pretrain_obj365_goldg_grit9m.py) +GRIT数据集可以从 [GRIT](https://huggingface.co/datasets/zzliang/GRIT#download-image) 中使用 img2dataset 包下载,默认指令下载后数据集大小为 1.1T,下载和处理预估需要至少 2T 硬盘空间,可根据硬盘容量酌情下载。下载后原始格式为: + +```text +mmdetection +├── configs +├── data +│ ├── grit_raw +│ │ ├── 00000_stats.json +│ │ ├── 00000.parquet +│ │ ├── 00000.tar +│ │ ├── 00001_stats.json +│ │ ├── 00001.parquet +│ │ ├── 00001.tar +│ │ ├── ... +``` + +下载后需要对格式进行进一步处理: + +```shell +python tools/dataset_converters/grit_processing.py data/grit_raw data/grit_processed +``` + +处理后的格式为: + +```text +mmdetection +├── configs +├── data +│ ├── grit_processed +│ │ ├── annotations +│ │ │ ├── 00000.json +│ │ │ ├── 00001.json +│ │ │ ├── ... +│ │ ├── images +│ │ │ ├── 00000 +│ │ │ │ ├── 000000000.jpg +│ │ │ │ ├── 000000003.jpg +│ │ │ │ ├── 000000004.jpg +│ │ │ │ ├── ... +│ │ │ ├── 00001 +│ │ │ ├── ... +``` + +对于 GRIT 数据集,你需要使用 [grit2odvg.py](../../tools/dataset_converters/grit2odvg.py) 转化成需要的 ODVG 格式: + +```python +python tools/dataset_converters/grit2odvg.py data/grit_processed/ +``` + +程序运行完成后会在 `data/grit_processed` 目录下创建 `grit20m_vg.json` 新文件,大概包含 9M 数据,完整结构如下: + +```text +mmdetection +├── configs +├── data +│ ├── grit_processed +| | ├── grit20m_vg.json +│ │ ├── annotations +│ │ │ ├── 00000.json +│ │ │ ├── 00001.json +│ │ │ ├── ... +│ │ ├── images +│ │ │ ├── 00000 +│ │ │ │ ├── 000000000.jpg +│ │ │ │ ├── 000000003.jpg +│ │ │ │ ├── 000000004.jpg +│ │ │ │ ├── ... +│ │ │ ├── 00001 +│ │ │ ├── ... 
```

### 5 V3Det

对应的训练配置为

@@ -347,7 +418,7 @@ mmdetection

### 4 LVIS 1.0

-参加后面的 `微调数据集准备` 的 `2 LVIS 1.0` 部分。完整数据集结构如下:
+参见后面的 `微调数据集准备` 的 `2 LVIS 1.0` 部分。完整数据集结构如下:

```text
mmdetection
diff --git a/configs/mm_grounding_dino/usage.md b/configs/mm_grounding_dino/usage.md
new file mode 100644
index 00000000000..f0773c8cf0e
--- /dev/null
+++ b/configs/mm_grounding_dino/usage.md
@@ -0,0 +1,490 @@
# Usage

## Install

After installing MMDet according to the instructions in [get_started](../../docs/en/get_started.md), you need to install the additional dependency packages:

```shell
cd $MMDETROOT

pip install -r requirements/multimodal.txt
pip install emoji ddd-dataset
pip install git+https://github.com/lvis-dataset/lvis-api.git
```

Please note that the LVIS third-party library does not currently support numpy 1.24, so make sure your numpy version meets the requirement. Installing numpy 1.23 is recommended.

## Instructions

### Download BERT Weights

MM Grounding DINO uses BERT as its language model and requires access to https://huggingface.co/. If you encounter connection errors due to network restrictions, you can download the necessary files on a machine with internet access, save them locally, and then point the `lang_model_name` field in the configuration file to the local path. For specific instructions, please refer to the following code:

```python
from transformers import BertConfig, BertModel
from transformers import AutoTokenizer

config = BertConfig.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, config=config)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

config.save_pretrained("your path/bert-base-uncased")
model.save_pretrained("your path/bert-base-uncased")
tokenizer.save_pretrained("your path/bert-base-uncased")
```

### Download NLTK Weights

When MM Grounding DINO performs Phrase Grounding inference, it may need to extract noun phrases, which requires downloading specific NLTK models at runtime. Since some environments cannot connect to the internet, you can download these models in advance to the `~/nltk_data` path:

```python
import nltk
nltk.download('punkt', download_dir='~/nltk_data')
nltk.download('averaged_perceptron_tagger', download_dir='~/nltk_data')
```

### Download MM Grounding DINO-T Weights

For convenience in the demonstrations below, you can download the MM Grounding DINO-T model weights in advance to the current path:

```shell
wget https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth
```

## Inference

Before running inference, for a better experience on different images, it is recommended that you first download [these images](https://github.com/microsoft/X-Decoder/tree/main/inference_demo/images) to the current path.

MM Grounding DINO supports four types of inference: Closed-Set Object Detection, Open Vocabulary Object Detection, Phrase Grounding, and Referring Expression Comprehension. The details are explained below.

**(1) Closed-Set Object Detection**

Since MM Grounding DINO is a pretrained model, it can theoretically be applied to any closed-set detection dataset. Currently, we support commonly used datasets such as coco/voc/cityscapes/objects365v1/lvis, etc.
Below, we will use coco as an example. + +```shell +python demo/image_demo.py images/animals.png \ + configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365.py \ + --weights grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth \ + --texts '$: coco' +``` + +The predictions for `outputs/vis/animals.png` will be generated in the current directory, as shown in the following image. + +