import os
import spaces
import torch  # needed below for kv-cache id concatenation
import gradio as gr
from kokoro import KPipeline
from qwen_vl_utils import process_vision_info
from demo.infer import LiveCCDemoInfer

model_path = 'chenjoya/LiveCC-7B-Instruct'
def _init_infer():
    # Lazily build a singleton LiveCCDemoInfer on the GPU worker; the model is
    # too heavy to construct at import time on a ZeroGPU Space.
    return LiveCCDemoInfer(model_path, device='cuda')

# Module-global placeholder for the lazily initialized model
infer = None
# Placeholder text shown while Real-Time Commentary waits for video frames
# (in the LiveCC repo this constant lives on GradioBackend in demo/app.py)
waiting_video_response = 'waiting video input...'
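# On a ZeroGPU Space, CUDA is only available inside handlers decorated with
# @spaces.GPU (assumed deployment detail), which is why the model is created
# lazily on the first request instead of at import time.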
with gr.Blocks() as demo:
    gr.Markdown("## LiveCC Conversation and Real-Time Commentary - Gradio Demo")
    gr.Markdown("### [LiveCC: Learning Video LLM with Streaming Speech Transcription at Scale (CVPR 2025)](https://showlab.github.io/livecc/)")
    gr.Markdown("1️⃣ Select a mode: Real-Time Commentary (LiveCC) or Conversation (common QA)")
    gr.Markdown("2️⃣🅰️ Real-Time Commentary: input a query (optional) -> click or upload a video.")
    gr.Markdown("2️⃣🅱️ Conversation: click or upload a video -> input a query.")
    gr.Markdown("*The hosted Gradio demo has extra latency (3~5 s). For a truly real-time experience, please deploy locally: https://github.com/showlab/livecc*")
    gr_state = gr.State({}, render=False)  # holds all mutable session state, including the kv cache
    gr_video_state = gr.JSON({}, visible=False)  # lightweight video-only state (path + timestamp), kept separate from gr_state
    gr_static_trigger = gr.Number(value=0, visible=False)  # gates whether streaming runs at all
    gr_dynamic_trigger = gr.Number(value=0, visible=False)  # toggled each cycle for continuous refresh
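    # Trigger pattern: gr_static_trigger switches streaming on/off, while
    # gr_dynamic_trigger is flipped 0 <-> 1 after each streaming pass so that
    # its .change event keeps re-firing the commentary loop below.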
    with gr.Row():
        with gr.Column():
            gr_video = gr.Video(
                label="video",
                elem_id="gr_video",
                visible=True,
                sources=['upload'],
                autoplay=True,
                include_audio=False,
                width=720,
                height=480,
            )
            gr_examples = gr.Examples(
                examples=[
                    'demo/sources/howto_fix_laptop_mute_1080p.mp4',
                    'demo/sources/writing_mute_1080p.mp4',
                ],
                inputs=[gr_video],
            )
            gr_clean_button = gr.Button("Clean (press me before changing video)", elem_id="gr_button")
        with gr.Column():
            with gr.Row():
                gr_radio_mode = gr.Radio(label="Select Mode", choices=["Real-Time Commentary", "Conversation"], elem_id="gr_radio_mode", value='Real-Time Commentary', interactive=True)
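            # Conversation mode is served by gr_chatinterface_fn below; Real-Time
            # Commentary is driven by the trigger / streaming loop further down.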
            @spaces.GPU  # assumed ZeroGPU decorator: this handler needs CUDA for generation
            def gr_chatinterface_fn(message, history, state, video_path, mode):
                global infer
                yield '(initializing model, thanks for waiting...)', state
                if infer is None:
                    infer = _init_infer()
                state['video_path'] = video_path
                yield '(finished initialization, responding...)', state
                if mode != 'Conversation':
                    # Commentary responses come from the streaming loop, not this handler
                    yield waiting_video_response, state
                    return
                query = message
                if video_path:
                    message = {
                        "role": "user",
                        "content": [
                            {"type": "video", "video": video_path},
                            {"type": "text", "text": query},
                        ],
                    }
                else:
                    message = {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": query},
                        ],
                    }
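                # The dict above follows the Qwen2-VL multimodal chat schema that
                # qwen_vl_utils.process_vision_info consumes to load video frames.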
                image_inputs, video_inputs = process_vision_info([message])
                texts = infer.processor.apply_chat_template([message], tokenize=False, add_generation_prompt=True, return_tensors='pt')
                past_ids = state.get('past_ids', None)
                if past_ids is not None:
                    # Follow-up turn: close the previous assistant turn and drop the
                    # system prompt, which is already present in the cached ids
                    texts = '<|im_end|>\n' + texts[infer.system_prompt_offset:]
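                # Example of the continuation string (assumed Qwen2-VL chat template):
                # a follow-up turn is fed as
                #   '<|im_end|>\n<|im_start|>user\n...<|im_end|>\n<|im_start|>assistant\n'
                # so past_ids + the newly tokenized ids form one contiguous stream.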
                inputs = infer.processor(
                    text=texts,
                    images=image_inputs,
                    videos=video_inputs,
                    return_tensors="pt",
                )
                inputs = inputs.to(infer.model.device)
                if past_ids is not None:
                    # Prepend the cached ids so positions line up with past_key_values,
                    # and keep the attention mask aligned with the concatenated ids
                    inputs['input_ids'] = torch.cat([past_ids, inputs.input_ids], dim=1)
                    inputs['attention_mask'] = torch.cat([torch.ones_like(past_ids), inputs.attention_mask], dim=1)
                outputs = infer.model.generate(
                    **inputs, past_key_values=state.get('past_key_values', None),
                    return_dict_in_generate=True, do_sample=False,
                    repetition_penalty=1.05,
                    max_new_tokens=512,
                )
                state['past_key_values'] = outputs.past_key_values
                # Cache all ids except the trailing turn terminator, which is
                # re-appended textually ('<|im_end|>\n') on the next turn
                state['past_ids'] = outputs.sequences[:, :-1]
                response = infer.processor.decode(outputs.sequences[0, inputs.input_ids.size(1):], skip_special_tokens=True)
                print(response)
                yield response, state  # return the updated state so the kv cache survives across turns
            def gr_chatinterface_chatbot_clear_fn():
                return {}, {}, 0, 0

            gr_chatinterface = gr.ChatInterface(
                fn=gr_chatinterface_fn,
                type="messages",
                additional_inputs=[gr_state, gr_video, gr_radio_mode],
                additional_outputs=[gr_state],
            )
            gr_chatinterface.chatbot.clear(fn=gr_chatinterface_chatbot_clear_fn, outputs=[gr_video_state, gr_state, gr_static_trigger, gr_dynamic_trigger])
            # The clean button also empties the chatbot itself, hence the extra [] output
            gr_clean_button.click(fn=lambda: [[], *gr_chatinterface_chatbot_clear_fn()], outputs=[gr_chatinterface.chatbot, gr_video_state, gr_state, gr_static_trigger, gr_dynamic_trigger])
            def gr_for_streaming(history: list[gr.ChatMessage], video_state: dict, state: dict, mode: str, static_trigger: int, dynamic_trigger: int):
                # if static_trigger == 0:
                #     return gr_chatinterface_chatbot_clear_fn()
                # if video_state['video_path'] != state.get('video_path', None):
                #     return gr_chatinterface_chatbot_clear_fn()
                state.update(video_state)
                # Find the newest user query and the assistant "waiting" placeholder
                query, assistant_waiting_message = None, None
                for message in history[::-1]:
                    if message['role'] == 'user':
                        if message['metadata'] is None or message['metadata'].get('status', '') == '':
                            query = message['content']
                            if message['metadata'] is None:
                                message['metadata'] = {}
                            message['metadata']['status'] = 'pending'
                            continue
                        if query is not None:  # mark earlier messages as done
                            message['metadata']['status'] = 'done'
                    elif message['content'] == waiting_video_response:
                        assistant_waiting_message = message
                # gradio_backend wraps LiveCCDemoInfer for streaming commentary; it comes
                # from demo/app.py in the LiveCC repo and is assumed to be in scope here
                for (start_timestamp, stop_timestamp), response, state in gradio_backend(query=query, state=state, mode=mode):
                    if start_timestamp >= 0:
                        response_with_timestamp = f'{start_timestamp:.1f}s-{stop_timestamp:.1f}s: {response}'
                        if assistant_waiting_message is None:
                            history.append(gr.ChatMessage(role="assistant", content=response_with_timestamp))
                        else:
                            assistant_waiting_message['content'] = response_with_timestamp
                            assistant_waiting_message = None
                    yield history, state, dynamic_trigger
                # Flip the trigger so its .change handler fires again and the loop re-arms
                yield history, state, 1 - dynamic_trigger
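            # Event chain for commentary mode:
            #   video change -> triggers set to 1 -> JS reads <video> playback time ->
            #   gr_get_video_state normalizes the path -> gr_for_streaming yields text ->
            #   gr_dynamic_trigger flips -> the chain re-fires.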
            js_video_timestamp_fetcher = """
                (video_state) => {
                    const videoEl = document.querySelector("#gr_video video");
                    return { video_path: videoEl.currentSrc, video_timestamp: videoEl.currentTime };
                }
            """
            def gr_get_video_state(video_state):
                # Gradio serves uploads via .../file=<local path>; recover the local path
                if 'file=' in video_state['video_path']:
                    video_state['video_path'] = video_state['video_path'].split('file=')[1]
                return video_state

            def gr_video_change_fn(mode):
                # Arm both triggers in commentary mode; keep them off for plain QA
                return [1, 1] if mode == "Real-Time Commentary" else [0, 0]
            gr_video.change(
                fn=gr_video_change_fn,
                inputs=[gr_radio_mode],
                outputs=[gr_static_trigger, gr_dynamic_trigger]
            )
            gr_dynamic_trigger.change(
                fn=gr_get_video_state,
                inputs=[gr_video_state],
                outputs=[gr_video_state],
                js=js_video_timestamp_fetcher
            ).then(
                fn=gr_for_streaming,
                inputs=[gr_chatinterface.chatbot, gr_video_state, gr_state, gr_radio_mode, gr_static_trigger, gr_dynamic_trigger],
                outputs=[gr_chatinterface.chatbot, gr_state, gr_dynamic_trigger],
            )
demo.queue(max_size=5, default_concurrency_limit=5)
demo.launch(share=True)
# --- optional TTS streaming (disabled) ---
# Enabling this would also require `import numpy as np` and a gradio_backend
# exposing a thread-safe `contents` queue plus a Kokoro `audio_pipeline`.
# gr_tts = gr.Audio(visible=False, elem_id="gr_tts", streaming=True, autoplay=True)
# def tts():
#     while True:
#         contents = ''
#         while not gradio_backend.contents.empty():
#             content = gradio_backend.contents.get()
#             contents += ' ' + content.rstrip(' ...')
#         contents = contents.strip()
#         if contents:
#             generator = gradio_backend.audio_pipeline(contents, voice='af_heart', speed=1.2)
#             for _, _, audio_torch in generator:
#                 audio_np = audio_torch.cpu().numpy()
#                 max_val = np.max(np.abs(audio_np))
#                 if max_val > 0:
#                     audio_np = audio_np / max_val  # normalize to [-1, 1]
#                 audio_int16 = (audio_np * 32767).astype(np.int16)
#                 yield (24000, audio_int16)  # Kokoro outputs 24 kHz mono
# gr_video.change(fn=tts, outputs=[gr_tts])
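# Minimal sketch of the Kokoro side, assuming the kokoro package's KPipeline API
# ('a' = American English, voices such as 'af_heart'):
#
#   import numpy as np
#   from kokoro import KPipeline
#   audio_pipeline = KPipeline(lang_code='a')
#   for _, _, audio in audio_pipeline("Hello from LiveCC", voice='af_heart', speed=1.2):
#       pcm = (audio.cpu().numpy() * 32767).astype(np.int16)  # 24 kHz mono PCM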