File size: 2,664 Bytes
021fd45
704bddb
5dd0627
6d16e6e
a5e0173
5dd0627
 
 
 
 
 
a5e0173
5dd0627
 
 
 
 
 
 
 
 
 
 
 
a5e0173
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5dd0627
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a5e0173
5dd0627
 
 
 
a5e0173
5dd0627
 
 
 
6d16e6e
5dd0627
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer

# Model setting
model_path = "OpenGVLab/InternVideo2_5_Chat_8B"

# Load the tokenizer and model with remote code enabled.
# NOTE: trust_remote_code=True executes Python shipped inside the model repo —
# only use with a trusted checkpoint.
# .half().cuda() casts to fp16 and moves the model to GPU at import time, so
# this module requires a CUDA device just to be imported.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()

# Get the image processor from the vision tower.
image_processor = model.get_vision_tower().image_processor

# Evaluation settings
max_num_frames = 512  # upper bound on frames sampled from the input video
generation_config = {
    "do_sample": False,
    # With do_sample=False decoding is greedy; temperature/top_p are
    # effectively inert and only kept to silence generation warnings.
    "temperature": 0.0,
    "max_new_tokens": 1024,
    "top_p": 0.1,
    "num_beams": 1,
}

# NOTE(review): this module-level path appears unused — every chat function
# below takes video_path as a parameter, which shadows it. Verify before removing.
video_path = "your_video.mp4"  # (For testing locally, update as needed)

# Single-turn conversation example:
def single_turn_chat(video_path, user_prompt):
    output, chat_history = model.chat(
        video_path=video_path,
        tokenizer=tokenizer,
        user_prompt=user_prompt,
        return_history=True,
        max_num_frames=max_num_frames,
        generation_config=generation_config
    )
    return output

# Multi-turn conversation example:
def multi_turn_chat(video_path, user_prompt, chat_history):
    output, chat_history = model.chat(
        video_path=video_path,
        tokenizer=tokenizer,
        user_prompt=user_prompt,
        chat_history=chat_history,
        return_history=True,
        max_num_frames=max_num_frames,
        generation_config=generation_config
    )
    return output, chat_history

# For the Gradio interface, we'll combine these into a chat function.
def chat_interface(video_path, user_prompt, chat_history):
    """Gradio callback: answer *user_prompt* about the uploaded video.

    Parameters
    ----------
    video_path : str | None
        Filepath of the uploaded video; ``gr.Video`` yields ``None`` when
        nothing has been uploaded yet.
    user_prompt : str
        The user's question.
    chat_history : list | None
        Conversation state from ``gr.State`` (``None`` on first call).

    Returns
    -------
    tuple
        ``(model_reply, updated_history)`` — shapes match the ``outputs``
        wired up in the Blocks UI.
    """
    if chat_history is None:
        chat_history = []
    # Guard: without a video, model.chat would fail on a None path.
    if not video_path:
        return "Please upload a video first.", chat_history
    # Delegate to the shared multi-turn helper instead of duplicating the
    # model.chat call.
    return multi_turn_chat(video_path, user_prompt, chat_history)

# Build the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("## InternVideo2_5_Chat_8B Chat Interface")
    with gr.Row():
        # gr.Video has no `type` kwarg (that belongs to gr.Image/gr.Audio);
        # passing it raises TypeError on Gradio 4.x. The component hands the
        # callback a filepath by default, which is what chat_interface expects.
        video_input = gr.Video(label="Upload Video")
        question_input = gr.Textbox(
            label="Enter your question",
            placeholder="Type your question here...",
        )
    chat_state = gr.State([])  # conversation history carried across turns
    output_text = gr.Textbox(label="Model Response")

    send_btn = gr.Button("Send")
    send_btn.click(
        chat_interface,
        inputs=[video_input, question_input, chat_state],
        outputs=[output_text, chat_state],
    )

if __name__ == "__main__":
    demo.launch()