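# LiveCC Gradio demo for Hugging Face Spaces (ZeroGPU): real-time video
# commentary and video question answering with chenjoya/LiveCC-7B-Instruct.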
import os
import spaces
import gradio as gr
from kokoro import KPipeline
from demo.infer import LiveCCDemoInfer

model_path = 'chenjoya/LiveCC-7B-Instruct'

def _init_infer():
    # Build the LiveCCDemoInfer singleton; called only from @spaces.GPU
    # functions, where CUDA is actually available on ZeroGPU.
    return LiveCCDemoInfer(model_path, device='cuda')

# Module-global placeholder; populated lazily on the first GPU call.
infer = None
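# Placeholder assistant reply shown until video frames start streaming in
# Real-Time Commentary mode (constant name is an assumption of this sketch).
WAITING_VIDEO_RESPONSE = 'waiting video input...'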
with gr.Blocks() as demo:
    gr.Markdown("## LiveCC Real-Time Commentary and Conversation - Gradio Demo")
    gr.Markdown("### [LiveCC: Learning Video LLM with Streaming Speech Transcription at Scale (CVPR 2025)](https://showlab.github.io/livecc/)")
    gr.Markdown("1️⃣ Select a mode: Real-Time Commentary (LiveCC) or Conversation (common QA)")
    gr.Markdown("2️⃣🅰️ Real-Time Commentary: input a query (optional) -> click or upload a video.")
    gr.Markdown("2️⃣🅱️ Conversation: click or upload a video -> input a query.")
    gr.Markdown("*The web Gradio demo adds noticeable latency (3s~5s). For a truly real-time experience, please deploy locally: https://github.com/showlab/livecc*")
    gr_state = gr.State({}, render=False)  # holds all useful state, including the KV cache
    gr_video_state = gr.JSON({}, visible=False)  # records only video state; belongs to gr_state but is lightweight
    gr_static_trigger = gr.Number(value=0, visible=False)  # gates whether streaming is active
    gr_dynamic_trigger = gr.Number(value=0, visible=False)  # toggled to drive continuous refresh
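    # How streaming works: uploading a video arms gr_static_trigger and
    # gr_dynamic_trigger; each change of gr_dynamic_trigger fetches the current
    # <video> timestamp in the browser, runs one model step, then flips the
    # trigger (1 - x) to schedule the next step.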
    with gr.Row():
        with gr.Column():
            gr_video = gr.Video(
                label="video",
                elem_id="gr_video",
                visible=True,
                sources=['upload'],
                autoplay=True,
                include_audio=False,
                width=720,
                height=480
            )
            gr_examples = gr.Examples(
                examples=[
                    'demo/sources/howto_fix_laptop_mute_1080p.mp4',
                    'demo/sources/writing_mute_1080p.mp4'
                ],
                inputs=[gr_video],
            )
gr_clean_button = gr.Button("Clean (Press me before changing video)", elem_id="gr_button")
        with gr.Column():
            with gr.Row():
                gr_radio_mode = gr.Radio(label="Select Mode", choices=["Real-Time Commentary", "Conversation"], elem_id="gr_radio_mode", value='Real-Time Commentary', interactive=True)
            @spaces.GPU
            def gr_chatinterface_fn(message, history, state, video_path, mode):
                # Lazily create the model on the first call; on ZeroGPU, CUDA is
                # only usable inside @spaces.GPU functions.
                global infer
                if infer is None:
                    infer = _init_infer()
                state['video_path'] = video_path
                if mode == 'Conversation':
                    # video_qa is expected to return (response, state) to match
                    # additional_outputs=[gr_state] below.
                    return infer.video_qa(query=message, state=state)
                else:
                    return WAITING_VIDEO_RESPONSE, state

            def gr_chatinterface_chatbot_clear_fn():
                return {}, {}, 0, 0

            gr_chatinterface = gr.ChatInterface(
                fn=gr_chatinterface_fn,
                type="messages",
                additional_inputs=[gr_state, gr_video, gr_radio_mode],
                additional_outputs=[gr_state],
            )
            gr_chatinterface.chatbot.clear(fn=gr_chatinterface_chatbot_clear_fn, outputs=[gr_video_state, gr_state, gr_static_trigger, gr_dynamic_trigger])
            # The clear button also empties the chatbot itself, hence the extra [] return.
            gr_clean_button.click(fn=lambda: [[], *gr_chatinterface_chatbot_clear_fn()], outputs=[gr_chatinterface.chatbot, gr_video_state, gr_state, gr_static_trigger, gr_dynamic_trigger])
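    # Streaming step: find the newest unanswered user query in the chat history,
    # stream timestamped commentary for the current video position, and either
    # append new assistant messages or fill in the "waiting" placeholder.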
    @spaces.GPU
    def gr_for_streaming(history: list[gr.ChatMessage], video_state: dict, state: dict, mode: str, static_trigger: int, dynamic_trigger: int):
        # if static_trigger == 0:
        #     return gr_chatinterface_chatbot_clear_fn()
        # if video_state['video_path'] != state.get('video_path', None):
        #     return gr_chatinterface_chatbot_clear_fn()
        global infer
        if infer is None:
            infer = _init_infer()
        state.update(video_state)
        # Walk the history backwards: the newest pending user message becomes
        # the query; earlier assistant messages are marked done.
        query, assistant_waiting_message = None, None
        for message in history[::-1]:
            if message['role'] == 'user':
                if message['metadata'] is None or message['metadata'].get('status', '') == '':
                    query = message['content']
                    if message['metadata'] is None:
                        message['metadata'] = {}
                    message['metadata']['status'] = 'pending'
                continue
            if query is not None:  # put the others as done
                message['metadata']['status'] = 'done'
            elif message['content'] == WAITING_VIDEO_RESPONSE:
                assistant_waiting_message = message
        # Assumption: LiveCCDemoInfer provides a live_cc(query, state) generator
        # yielding ((start, stop), response, state) tuples, as in demo/infer.py.
        for (start_timestamp, stop_timestamp), response, state in infer.live_cc(query=query, state=state):
            if start_timestamp >= 0:
                response_with_timestamp = f'{start_timestamp:.1f}s-{stop_timestamp:.1f}s: {response}'
                if assistant_waiting_message is None:
                    history.append(gr.ChatMessage(role="assistant", content=response_with_timestamp))
                else:
                    assistant_waiting_message['content'] = response_with_timestamp
                    assistant_waiting_message = None
                yield history, state, dynamic_trigger
        # Flip the trigger so gr_dynamic_trigger.change fires again for the next step.
        yield history, state, 1 - dynamic_trigger
js_video_timestamp_fetcher = """
(state, video_state) => {
const videoEl = document.querySelector("#gr_video video");
return { video_path: videoEl.currentSrc, video_timestamp: videoEl.currentTime };
}
"""
def gr_get_video_state(video_state):
if 'file=' in video_state['video_path']:
video_state['video_path'] = video_state['video_path'].split('file=')[1]
return video_state
    def gr_video_change_fn(mode):
        # Arm both triggers only in Real-Time Commentary mode.
        return [1, 1] if mode == "Real-Time Commentary" else [0, 0]

    gr_video.change(
        fn=gr_video_change_fn,
        inputs=[gr_radio_mode],
        outputs=[gr_static_trigger, gr_dynamic_trigger]
    )

    gr_dynamic_trigger.change(
        fn=gr_get_video_state,
        inputs=[gr_video_state],
        outputs=[gr_video_state],
        js=js_video_timestamp_fetcher
    ).then(
        fn=gr_for_streaming,
        inputs=[gr_chatinterface.chatbot, gr_video_state, gr_state, gr_radio_mode, gr_static_trigger, gr_dynamic_trigger],
        outputs=[gr_chatinterface.chatbot, gr_state, gr_dynamic_trigger],
    )

# queue() bounds concurrent GPU work; share=True only matters when the script
# is run locally (Spaces ignores it).
demo.queue(max_size=5, default_concurrency_limit=5)
demo.launch(share=True)
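# The block below is an unfinished text-to-speech sketch kept commented out: it
# assumes numpy is imported as np and a backend object exposing a `contents`
# queue plus a kokoro `audio_pipeline`, none of which are defined above.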
# --- for streaming ---
# gr_tts = gr.Audio(visible=False, elem_id="gr_tts", streaming=True, autoplay=True)
# def tts():
#     while True:
#         contents = ''
#         while not gradio_backend.contents.empty():
#             content = gradio_backend.contents.get()
#             contents += ' ' + content.rstrip(' ...')
#         contents = contents.strip()
#         if contents:
#             generator = gradio_backend.audio_pipeline(contents, voice='af_heart', speed=1.2)
#             for _, _, audio_torch in generator:
#                 audio_np = audio_torch.cpu().numpy()
#                 # peak-normalize, then convert to 16-bit PCM at 24 kHz
#                 max_val = np.max(np.abs(audio_np))
#                 if max_val > 0:
#                     audio_np = audio_np / max_val
#                 audio_int16 = (audio_np * 32767).astype(np.int16)
#                 yield (24000, audio_int16)
# gr_video.change(fn=tts, outputs=[gr_tts])