Spaces: Running on Zero (file size: 10,230 Bytes)
import os
import spaces
import torch  # needed for torch.cat on the cached input ids below
import gradio as gr
from kokoro import KPipeline  # only needed by the optional TTS code commented out at the bottom
from qwen_vl_utils import process_vision_info
from demo.infer import LiveCCDemoInfer

model_path = 'chenjoya/LiveCC-7B-Instruct'
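# On ZeroGPU Spaces, CUDA is only available inside functions decorated with
# @spaces.GPU, so the model is built lazily in _init_infer() (called from the
# chat handler on first use) instead of at import time.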
def _init_infer():
    # create a singleton LiveCCDemoInfer inside the GPU context
    infer = LiveCCDemoInfer(model_path, device='cuda')
    return infer

# We'll keep a module-global placeholder, filled in on the first GPU call
infer = None
with gr.Blocks() as demo:
    gr.Markdown("## LiveCC Conversation and Real-Time Commentary - Gradio Demo")
    gr.Markdown("### [LiveCC: Learning Video LLM with Streaming Speech Transcription at Scale (CVPR 2025)](https://showlab.github.io/livecc/)")
    gr.Markdown("1️⃣ Select a mode: Real-Time Commentary (LiveCC) or Conversation (common QA)")
    gr.Markdown("2️⃣🅰️ Real-Time Commentary: input a query (optional) -> click or upload a video.")
    gr.Markdown("2️⃣🅱️ Conversation: click or upload a video -> input a query.")
    gr.Markdown("*The web Gradio demo adds extra latency (3s~5s). For a truly real-time experience, please deploy locally: https://github.com/showlab/livecc*")
    gr_state = gr.State({}, render=False)  # holds all useful state, including the kv cache
    gr_video_state = gr.JSON({}, visible=False)  # records only the video state; conceptually part of gr_state but kept lightweight
    gr_static_trigger = gr.Number(value=0, visible=False)  # controls whether streaming starts or stops
    gr_dynamic_trigger = gr.Number(value=0, visible=False)  # toggled for continuous refresh
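    # How the hidden components drive streaming: when a video is loaded in
    # Real-Time Commentary mode, gr_video_change_fn sets both triggers to 1;
    # each change of gr_dynamic_trigger then runs the JS timestamp fetcher and
    # gr_for_streaming, which flips gr_dynamic_trigger again at the end of the
    # pass, so the commentary loop keeps re-firing while the video plays.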
    with gr.Row():
        with gr.Column():
            gr_video = gr.Video(
                label="video",
                elem_id="gr_video",
                visible=True,
                sources=['upload'],
                autoplay=True,
                include_audio=False,
                width=720,
                height=480,
            )
            gr_examples = gr.Examples(
                examples=[
                    'demo/sources/howto_fix_laptop_mute_1080p.mp4',
                    'demo/sources/writing_mute_1080p.mp4',
                ],
                inputs=[gr_video],
            )
            gr_clean_button = gr.Button("Clean (Press me before changing video)", elem_id="gr_button")
        with gr.Column():
            with gr.Row():
                gr_radio_mode = gr.Radio(
                    label="Select Mode",
                    choices=["Real-Time Commentary", "Conversation"],
                    elem_id="gr_radio_mode",
                    value='Real-Time Commentary',
                    interactive=True,
                )
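            # Conversation-mode chat handler. It runs on ZeroGPU via @spaces.GPU,
            # initializes the model on first use, packs the uploaded video plus the
            # user text into a Qwen-VL style message, and generates a response,
            # caching past_ids / past_key_values in `state` for multi-turn reuse.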
            @spaces.GPU
            def gr_chatinterface_fn(message, history, state, video_path, mode):
                global infer
                yield '(initializing model, thanks for waiting...)', state
                if infer is None:
                    infer = _init_infer()
                state['video_path'] = video_path
                yield '(finished initialization, responding...)', state
                if mode != 'Conversation':
                    yield 'waiting video input...', state
                query = message
                if video_path:
                    message = {
                        "role": "user",
                        "content": [
                            {"type": "video", "video": video_path},
                            {"type": "text", "text": query},
                        ],
                    }
                else:
                    message = {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": query},
                        ],
                    }
                image_inputs, video_inputs = process_vision_info([message])
                texts = infer.processor.apply_chat_template([message], tokenize=False, add_generation_prompt=True, return_tensors='pt')
                past_ids = state.get('past_ids', None)
                if past_ids is not None:
                    # later turns: keep only the new turn after the system prompt and
                    # stitch it onto the cached ids with an explicit turn separator
                    texts = '<|im_end|>\n' + texts[infer.system_prompt_offset:]
                inputs = infer.processor(
                    text=texts,
                    images=image_inputs,
                    videos=video_inputs,
                    return_tensors="pt",
                )
                inputs.to(infer.model.device)
                if past_ids is not None:
                    inputs['input_ids'] = torch.cat([past_ids, inputs.input_ids], dim=1)
                outputs = infer.model.generate(
                    **inputs, past_key_values=state.get('past_key_values', None),
                    return_dict_in_generate=True, do_sample=False,
                    repetition_penalty=1.05,
                    max_new_tokens=512,
                )
                state['past_key_values'] = outputs.past_key_values
                state['past_ids'] = outputs.sequences[:, :-1]
                response = infer.processor.decode(outputs.sequences[0, inputs.input_ids.size(1):], skip_special_tokens=True)
                print(response)
                # note: an empty dict is yielded back as the new state, so the cache
                # stored above is not actually carried over to the next call
                yield response, {}
            def gr_chatinterface_chatbot_clear_fn():
                # reset video state, chat state, and both triggers
                return {}, {}, 0, 0
            gr_chatinterface = gr.ChatInterface(
                fn=gr_chatinterface_fn,
                type="messages",
                additional_inputs=[gr_state, gr_video, gr_radio_mode],
                additional_outputs=[gr_state],
            )
            gr_chatinterface.chatbot.clear(
                fn=gr_chatinterface_chatbot_clear_fn,
                outputs=[gr_video_state, gr_state, gr_static_trigger, gr_dynamic_trigger],
            )
            # the lambda returns five values ([] for the chatbot plus the four reset
            # values), so the chatbot is included in the outputs here as well
            gr_clean_button.click(
                fn=lambda: [[], *gr_chatinterface_chatbot_clear_fn()],
                outputs=[gr_chatinterface.chatbot, gr_video_state, gr_state, gr_static_trigger, gr_dynamic_trigger],
            )
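            # Real-Time Commentary driver: on every toggle of gr_dynamic_trigger it
            # reads the chat history, marks the latest unanswered user message as the
            # active query, asks the streaming backend for commentary on the current
            # video segment, and appends timestamped responses to the chatbot.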
            def gr_for_streaming(history: list[gr.ChatMessage], video_state: dict, state: dict, mode: str, static_trigger: int, dynamic_trigger: int):
                # if static_trigger == 0:
                #     return gr_chatinterface_chatbot_clear_fn()
                # if video_state['video_path'] != state.get('video_path', None):
                #     return gr_chatinterface_chatbot_clear_fn()
                state.update(video_state)
                query, assistant_waiting_message = None, None
                for message in history[::-1]:
                    if message['role'] == 'user':
                        if message['metadata'] is None or message['metadata'].get('status', '') == '':
                            query = message['content']
                            if message['metadata'] is None:
                                message['metadata'] = {}
                            message['metadata']['status'] = 'pending'
                            continue
                        if query is not None:  # mark other (earlier) user messages as done
                            message['metadata']['status'] = 'done'
                    elif message['content'] == GradioBackend.waiting_video_response:
                        assistant_waiting_message = message
                # NOTE: GradioBackend / gradio_backend are not defined in this file; they
                # come from the local (non-ZeroGPU) demo in the LiveCC repo, so this
                # streaming path presumably needs that backend to be wired in here.
                for (start_timestamp, stop_timestamp), response, state in gradio_backend(query=query, state=state, mode=mode):
                    if start_timestamp >= 0:
                        response_with_timestamp = f'{start_timestamp:.1f}s-{stop_timestamp:.1f}s: {response}'
                        if assistant_waiting_message is None:
                            history.append(gr.ChatMessage(role="assistant", content=response_with_timestamp))
                        else:
                            assistant_waiting_message['content'] = response_with_timestamp
                            assistant_waiting_message = None
                    yield history, state, dynamic_trigger
                yield history, state, 1 - dynamic_trigger
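            # JS helper passed to gr_dynamic_trigger.change: reads the playing
            # <video> element inside #gr_video and reports its source URL and the
            # current playback position, so the backend knows how far the video
            # has played when generating commentary.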
            js_video_timestamp_fetcher = """
                (state, video_state) => {
                    const videoEl = document.querySelector("#gr_video video");
                    return { video_path: videoEl.currentSrc, video_timestamp: videoEl.currentTime };
                }
            """

            def gr_get_video_state(video_state):
                # strip the gradio file-serving prefix to recover the local path
                if 'file=' in video_state['video_path']:
                    video_state['video_path'] = video_state['video_path'].split('file=')[1]
                return video_state

            def gr_video_change_fn(mode):
                # arm both triggers only in Real-Time Commentary mode
                return [1, 1] if mode == "Real-Time Commentary" else [0, 0]
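            # Event chain: uploading or selecting a video fires gr_video.change, which
            # arms the triggers; each gr_dynamic_trigger.change first runs the JS
            # fetcher + gr_get_video_state to refresh gr_video_state, then runs
            # gr_for_streaming, which toggles gr_dynamic_trigger again to keep the
            # loop going.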
            gr_video.change(
                fn=gr_video_change_fn,
                inputs=[gr_radio_mode],
                outputs=[gr_static_trigger, gr_dynamic_trigger],
            )
            gr_dynamic_trigger.change(
                fn=gr_get_video_state,
                inputs=[gr_video_state],
                outputs=[gr_video_state],
                js=js_video_timestamp_fetcher,
            ).then(
                fn=gr_for_streaming,
                inputs=[gr_chatinterface.chatbot, gr_video_state, gr_state, gr_radio_mode, gr_static_trigger, gr_dynamic_trigger],
                outputs=[gr_chatinterface.chatbot, gr_state, gr_dynamic_trigger],
            )

demo.queue(max_size=5, default_concurrency_limit=5)
demo.launch(share=True)
# --- for streaming (optional TTS, disabled) ---
# gr_tts = gr.Audio(visible=False, elem_id="gr_tts", streaming=True, autoplay=True)
# def tts():
#     while True:
#         contents = ''
#         while not gradio_backend.contents.empty():
#             content = gradio_backend.contents.get()
#             contents += ' ' + content.rstrip(' ...')
#         contents = contents.strip()
#         if contents:
#             generator = gradio_backend.audio_pipeline(contents, voice='af_heart', speed=1.2)
#             for _, _, audio_torch in generator:
#                 audio_np = audio_torch.cpu().numpy()
#                 max_val = np.max(np.abs(audio_np))
#                 if max_val > 0:
#                     audio_np = audio_np / max_val
#                 audio_int16 = (audio_np * 32767).astype(np.int16)
#                 yield (24000, audio_int16)
# gr_video.change(fn=tts, outputs=[gr_tts])
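# If re-enabled, this TTS path would additionally need `import numpy as np` and a
# backend object (the repo's GradioBackend) exposing a `contents` queue of text
# chunks and an `audio_pipeline`, presumably built from the kokoro KPipeline
# imported above; the tuples yielded to gr.Audio are (sample_rate_hz, int16 waveform).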