# inference_image.py
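"""Single-turn inference demo for Vitron.

Loads the LoRA weights (checkpoints/Vitron-lora/) on top of the base
checkpoint (checkpoints/Vitron-base) and runs either an image or a video
conversation, optionally conditioned on a bounding-box region prompt.
"""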
import torch
import os
from PIL import Image
from vitron.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, OBJS_TOKEN_INDEX
from vitron.conversation import conv_templates, SeparatorStyle
from vitron.model.builder import load_pretrained_model
from vitron.utils import disable_torch_init
from vitron.mm_utils import (
    tokenizer_image_token,
    get_model_name_from_path,
    KeywordsStoppingCriteria,
    tokenizer_image_region_token,
    preprocess_region,
    show_image_with_bboxes,
)


def inference_image():
    disable_torch_init()

    # Demo inputs: a local example image and a single-turn instruction.
    image = 'examples/extreme_ironing.jpg'
    inp = 'Could you help me transform the image into a video?'

    # Load the LoRA weights on top of the Vitron base checkpoint.
    model_base = 'checkpoints/Vitron-base'
    cache_dir = 'cache_dir'
    device = 'cuda'
    load_4bit, load_8bit = False, False
    model_path = 'checkpoints/Vitron-lora/'
    tokenizer, model, processor, _ = load_pretrained_model(
        model_path, model_base, 'vitron-llava-7b-lora-4',
        load_8bit, load_4bit, device=device, cache_dir=cache_dir)
    image_processor = processor['image']

    # Start a fresh llava_v1 conversation template.
    conv_mode = "llava_v1"
    conv = conv_templates[conv_mode].copy()
    roles = conv.roles

    # Preprocess the image and move it to the model's device in fp16.
    image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values']
    if type(image_tensor) is list:
        tensor = [img.to(model.device, dtype=torch.float16) for img in image_tensor]
    else:
        tensor = image_tensor.to(model.device, dtype=torch.float16)

    # Preprocess the region prompt.
    ori_im = Image.open(image).convert('RGB')
    ori_im_size = [ori_im.width, ori_im.height]
    print('ori_im_size: ', ori_im_size)  # [570, 380]
    bbox = [0, 100, 300, 200]
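    # Note: preprocess_region is provided by vitron.mm_utils; judging from its
    # arguments it presumably rescales the box from the original image size
    # (570x380 here) to the 224x224 resolution seen by the vision encoder.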
    region = [preprocess_region(bbox, ori_im_size, [224, 224])]
    print('image_tensor: ', image_tensor)

    # Save visualizations of the box on the original image and on the
    # preprocessed tensor for a quick sanity check.
    show_image_with_bboxes(image_path=image, bboxes=[bbox], save_path=os.path.join('./', 'ann_1.jpg'))
    show_image_with_bboxes(image_path=image_tensor[0], bboxes=region, save_path=os.path.join('./', 'ann_2.jpg'))

    print(f"{roles[1]}: {inp}")
    print('model device', model.device)

    # Prepend the image token and build the single-turn prompt.
    inp = DEFAULT_IMAGE_TOKEN + '\n' + inp
    conv.append_message(conv.roles[0], inp)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()
    print('prompt: ', prompt)

    # Tokenize the prompt with the region-aware tokenizer (the commented line
    # is the plain image-token variant).
    # input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
    input_ids = tokenizer_image_region_token(prompt, tokenizer, OBJS_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
    print('input_ids: ', input_ids)

    # Stop generation at the conversation separator.
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=tensor,
            # regions=region,
            do_sample=True,
            temperature=0.2,
            max_new_tokens=1024,
            use_cache=True,
            stopping_criteria=[stopping_criteria])

    # Decode only the newly generated tokens.
    outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
    print(outputs)


def inference_video():
    disable_torch_init()

    # Demo inputs: a local example video and a single-turn question.
    video = 'examples/sample_demo_1.mp4'
    inp = 'Why is this video funny?'

    # Load the LoRA weights on top of the Vitron base checkpoint.
    # model_path = 'LanguageBind/Video-LLaVA-7B'
    model_base = 'checkpoints/Vitron-base'
    cache_dir = 'cache_dir'
    device = 'cuda'
    load_4bit, load_8bit = False, False
    model_path = 'checkpoints/Vitron-lora/'
    tokenizer, model, processor, _ = load_pretrained_model(
        model_path, model_base, 'vitron-llava-7b-lora-4',
        load_8bit, load_4bit, device=device, cache_dir=cache_dir)
    video_processor = processor['video']

    # Start a fresh llava_v1 conversation template.
    conv_mode = "llava_v1"
    conv = conv_templates[conv_mode].copy()
    roles = conv.roles

    # Preprocess the video and move it to the model's device in fp16.
    video_tensor = video_processor(video, return_tensors='pt')['pixel_values']
    if type(video_tensor) is list:
        tensor = [vid.to(model.device, dtype=torch.float16) for vid in video_tensor]
    else:
        tensor = video_tensor.to(model.device, dtype=torch.float16)

    print(f"{roles[1]}: {inp}")

    # One image token per sampled frame, followed by the question.
    inp = ' '.join([DEFAULT_IMAGE_TOKEN] * model.get_video_tower().config.num_frames) + '\n' + inp
    conv.append_message(conv.roles[0], inp)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()
    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

    # Stop generation at the conversation separator.
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

    # Region prompt, using a hard-coded original frame size for the demo video.
    bbox = [0, 100, 300, 200]
    region = [preprocess_region(bbox, (480, 600), [224, 224])]

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=tensor,
            regions=region,
            do_sample=True,
            temperature=0.1,
            max_new_tokens=1024,
            use_cache=True,
            stopping_criteria=[stopping_criteria])

    # Decode only the newly generated tokens.
    outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
    print(outputs)
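

# The checkpoint and example paths above are the repository defaults; adjust
# them to match your local layout. A CUDA device is required (tensors are
# moved to CUDA explicitly). Run the image demo with, e.g.,
#   python inference_image.py
# and switch the call below to inference_video() for the video example.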
if __name__ == '__main__':
    inference_image()
    # inference_video()