import gradio as gr
from demo.infer import LiveCCDemoInfer
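
# GradioBackend wraps LiveCCDemoInfer and routes each request to the inference
# method matching the selected UI mode: 'live_cc' streams timestamped
# commentary, 'video_qa' answers a single question. It also constructs a
# kokoro TTS pipeline, used only by the commented-out streaming-audio code at
# the bottom of this file.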
class GradioBackend:
waiting_video_response = 'Waiting for video input...'
not_found_video_response = 'Video does not exist...'
mode2api = {
'Real-Time Commentary': 'live_cc',
'Conversation': 'video_qa'
}
def __init__(self, model_path: str = 'chenjoya/LiveCC-7B-Instruct'):
self.infer = LiveCCDemoInfer(model_path)
from kokoro import KPipeline
self.audio_pipeline = KPipeline(lang_code='a')
    def __call__(self, query: str = None, state: dict = None, mode: str = 'Real-Time Commentary', **kwargs):
        state = state if state is not None else {}  # avoid sharing a mutable default dict across calls
        return getattr(self.infer, self.mode2api[mode])(query=query, state=state, **kwargs)
gradio_backend = GradioBackend()
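
# The backend can also be exercised directly, outside the UI. A minimal sketch
# with hypothetical query/state values ('Conversation' dispatches to
# LiveCCDemoInfer.video_qa):
#   response, state = gradio_backend(query='What is shown?', state={'video_path': 'demo/sources/howto_fix_laptop_mute_1080p.mp4'}, mode='Conversation')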
with gr.Blocks() as demo:
gr.Markdown("## LiveCC Real-Time Commentary and Conversation - Gradio Demo")
gr.Markdown("### [LiveCC: Learning Video LLM with Streaming Speech Transcription at Scale (CVPR 2025)](https://showlab.github.io/livecc/)")
gr.Markdown("1️⃣ Select Mode, Real-Time Commentary (LiveCC) or Conversation (Common QA)")
gr.Markdown("2️⃣ (Optional) Input a query. If there is no query, the default query is 'Please describe the video.' ")
gr.Markdown("3️⃣ Click the video, or upload a video.")
gr.Markdown("*Web Gradio has unexpected latency (3s~5s). If you want to enjoy the very real-time experience, please try our CLI demo https://github.com/showlab/livecc*")
    gr_state = gr.State({}, render=False) # holds all inference state, including the KV cache
    gr_video_state = gr.JSON({}, visible=False) # records only video state; logically part of gr_state, but kept lightweight
    gr_static_trigger = gr.Number(value=0, visible=False) # gates whether streaming is armed (1) or stopped (0)
    gr_dynamic_trigger = gr.Number(value=0, visible=False) # toggled after each pass to drive continuous refresh
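    # Refresh loop: a video change arms the two triggers; each pass of the
    # streaming generator below flips gr_dynamic_trigger, whose .change event
    # re-reads the playback timestamp and resumes streaming.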
with gr.Row():
with gr.Column():
gr_video = gr.Video(
label="video",
elem_id="gr_video",
visible=True,
sources=['upload'],
autoplay=True,
include_audio=False,
width=720,
height=480
)
gr_examples = gr.Examples(
examples=[
'demo/sources/howto_fix_laptop_mute_1080p.mp4',
],
inputs=[gr_video],
)
gr_clean_button = gr.Button("Clean (Press me before changing video)", elem_id="gr_button")
with gr.Column():
with gr.Row():
gr_radio_mode = gr.Radio(label="Select Mode", choices=["Real-Time Commentary", "Conversation"], elem_id="gr_radio_mode", value='Real-Time Commentary', interactive=True)
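
            # Chat handler: record the current video path in the shared state,
            # then delegate a single-turn response to the backend.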
def gr_chatinterface_fn(message, history, state, video_path, mode):
state['video_path'] = video_path
response, state = gradio_backend(query=message, state=state, mode=mode)
return response, state
def gr_chatinterface_chatbot_clear_fn():
return {}, {}, 0, 0
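            # additional_inputs threads the shared state, video, and mode into
            # every chat turn; additional_outputs writes the updated state back.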
gr_chatinterface = gr.ChatInterface(
fn=gr_chatinterface_fn,
type="messages",
additional_inputs=[gr_state, gr_video, gr_radio_mode],
additional_outputs=[gr_state],
)
gr_chatinterface.chatbot.clear(fn=gr_chatinterface_chatbot_clear_fn, outputs=[gr_video_state, gr_state, gr_static_trigger, gr_dynamic_trigger])
            gr_clean_button.click(fn=lambda: [[], *gr_chatinterface_chatbot_clear_fn()], outputs=[gr_chatinterface.chatbot, gr_video_state, gr_state, gr_static_trigger, gr_dynamic_trigger])
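
            # Streaming generator: find the newest unanswered user message and
            # mark it pending, then append (or fill into the 'Waiting for video
            # input...' placeholder) timestamped assistant responses as the
            # backend yields them. The final yield flips gr_dynamic_trigger to
            # schedule the next pass.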
def gr_for_streaming(history: list[gr.ChatMessage], video_state: dict, state: dict, mode: str, static_trigger: int, dynamic_trigger: int):
# if static_trigger == 0:
# return gr_chatinterface_chatbot_clear_fn()
# if video_state['video_path'] != state.get('video_path', None):
# return gr_chatinterface_chatbot_clear_fn()
state.update(video_state)
query, assistant_waiting_message = None, None
for message in history[::-1]:
if message['role'] == 'user':
if message['metadata'] is None or message['metadata'].get('status', '') == '':
query = message['content']
if message['metadata'] is None:
message['metadata'] = {}
message['metadata']['status'] = 'pending'
continue
if query is not None: # put others as done
message['metadata']['status'] = 'done'
elif message['content'] == GradioBackend.waiting_video_response:
assistant_waiting_message = message
for (start_timestamp, stop_timestamp), response, state in gradio_backend(query=query, state=state, mode=mode):
if start_timestamp >= 0:
response_with_timestamp = f'{start_timestamp:.1f}s-{stop_timestamp:.1f}s: {response}'
if assistant_waiting_message is None:
history.append(gr.ChatMessage(role="assistant", content=response_with_timestamp))
else:
assistant_waiting_message['content'] = response_with_timestamp
assistant_waiting_message = None
yield history, state, dynamic_trigger
yield history, state, 1 - dynamic_trigger
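
            # Client-side JS: read the <video> element's current source URL and
            # playback position, so streaming stays in sync with the viewer.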
js_video_timestamp_fetcher = """
(state, video_state) => {
const videoEl = document.querySelector("#gr_video video");
return { video_path: videoEl.currentSrc, video_timestamp: videoEl.currentTime };
}
"""
def gr_get_video_state(video_state):
if 'file=' in video_state['video_path']:
video_state['video_path'] = video_state['video_path'].split('file=')[1]
return video_state
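            # Arm both triggers only in Real-Time Commentary mode; Conversation
            # mode leaves streaming off and waits for explicit chat turns.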
def gr_video_change_fn(mode):
return [1, 1] if mode == "Real-Time Commentary" else [0, 0]
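            # Event chain: a video change arms the triggers; each dynamic-trigger
            # flip fetches the timestamp (JS), normalizes the path, then resumes
            # the streaming generator.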
gr_video.change(
fn=gr_video_change_fn,
inputs=[gr_radio_mode],
outputs=[gr_static_trigger, gr_dynamic_trigger]
)
gr_dynamic_trigger.change(
fn=gr_get_video_state,
inputs=[gr_video_state],
outputs=[gr_video_state],
js=js_video_timestamp_fetcher
).then(
fn=gr_for_streaming,
inputs=[gr_chatinterface.chatbot, gr_video_state, gr_state, gr_radio_mode, gr_static_trigger, gr_dynamic_trigger],
outputs=[gr_chatinterface.chatbot, gr_state, gr_dynamic_trigger],
)
demo.queue(max_size=5, default_concurrency_limit=5)
demo.launch(share=True)
# --- for streaming ---
# gr_tts = gr.Audio(visible=False, elem_id="gr_tts", streaming=True, autoplay=True)
# def tts():
# while True:
# contents = ''
# while not gradio_backend.contents.empty():
# content = gradio_backend.contents.get()
# contents += ' ' + content.rstrip(' ...')
# contents = contents.strip()
# if contents:
# generator = gradio_backend.audio_pipeline(contents, voice='af_heart', speed=1.2)
# for _, _, audio_torch in generator:
# audio_np = audio_torch.cpu().numpy()
# max_val = np.max(np.abs(audio_np))
# if max_val > 0:
# audio_np = audio_np / max_val
# audio_int16 = (audio_np * 32767).astype(np.int16)
# yield (24000, audio_int16)
# gr_video.change(fn=tts, outputs=[gr_tts])
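# To enable the TTS path above: add `import numpy as np` at the top, give
# GradioBackend a `contents` queue (assumed here, e.g. queue.Queue) that the
# live_cc generator pushes text into, and uncomment the block. kokoro yields
# 24 kHz audio tensors; the block peak-normalizes them to int16 for streaming
# gr.Audio playback.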