# LiveCC Gradio demo — Hugging Face Space (running on Zero)
import os
import gradio as gr
from demo.infer import LiveCCDemoInfer
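
# Thin wrapper around the LiveCC inference engine: mode2api maps each UI mode
# to the LiveCCDemoInfer method that serves it ('live_cc' streams timestamped
# commentary; 'video_qa' answers conversational questions about the video).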
class GradioBackend:
    waiting_video_response = 'Waiting for video input...'
    not_found_video_response = 'Video does not exist...'
    mode2api = {
        'Real-Time Commentary': 'live_cc',
        'Conversation': 'video_qa'
    }

    def __init__(self, model_path: str = 'chenjoya/LiveCC-7B-Instruct'):
        self.infer = LiveCCDemoInfer(model_path)
        from kokoro import KPipeline
        self.audio_pipeline = KPipeline(lang_code='a')

    def __call__(self, query: str = None, state: dict = {}, mode: str = 'Real-Time Commentary', **kwargs):
        return getattr(self.infer, self.mode2api[mode])(query=query, state=state, **kwargs)
gradio_backend = GradioBackend()
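
# UI layout: the left column holds the video player, example, and clean button;
# the right column holds the mode selector and chat interface. Two hidden
# gr.Number components act as triggers that drive the streaming refresh loop.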
with gr.Blocks() as demo:
    gr.Markdown("## LiveCC Real-Time Commentary and Conversation - Gradio Demo")
    gr.Markdown("#### [LiveCC: Learning Video LLM with Streaming Speech Transcription at Scale](https://showlab.github.io/livecc/)")
    gr_state = gr.State({}, render=False) # holds all useful state, including the kv cache
    gr_video_state = gr.JSON({}, visible=False) # records only the video state; logically part of gr_state but kept lightweight
    gr_static_trigger = gr.Number(value=0, visible=False) # controls whether streaming starts or stops
    gr_dynamic_trigger = gr.Number(value=0, visible=False) # flipped repeatedly to drive continuous refresh
    with gr.Row():
        with gr.Column():
            gr_video = gr.Video(
                label="video",
                elem_id="gr_video",
                visible=True,
                sources=['upload'],
                autoplay=True,
                include_audio=False,
                width=720,
                height=480
            )
            gr_examples = gr.Examples(
                examples=[
                    'demo/sources/howto_fix_laptop_mute_1080p.mp4',
                ],
                inputs=[gr_video],
            )
            gr_clean_button = gr.Button("Clean (Press me before changing video)", elem_id="gr_button")
        with gr.Column():
            with gr.Row():
                gr_radio_mode = gr.Radio(label="Select Mode", choices=["Real-Time Commentary", "Conversation"], elem_id="gr_radio_mode", value='Real-Time Commentary', interactive=True)

            def gr_chatinterface_fn(message, history, state, mode):
                response, state = gradio_backend(query=message, state=state, mode=mode)
                return response, state

            def gr_chatinterface_chatbot_clear_fn():
                # reset video state, chat state, and both triggers
                return {}, {}, 0, 0

            gr_chatinterface = gr.ChatInterface(
                fn=gr_chatinterface_fn,
                type="messages",
                additional_inputs=[gr_state, gr_radio_mode],
                additional_outputs=[gr_state],
            )
            gr_chatinterface.chatbot.clear(fn=gr_chatinterface_chatbot_clear_fn, outputs=[gr_video_state, gr_state, gr_static_trigger, gr_dynamic_trigger])
            # the lambda returns five values (an empty chat history plus the four
            # reset values), so the chatbot itself must be among the outputs
            gr_clean_button.click(fn=lambda: [[], *gr_chatinterface_chatbot_clear_fn()], outputs=[gr_chatinterface.chatbot, gr_video_state, gr_state, gr_static_trigger, gr_dynamic_trigger])
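
    # gr_for_streaming scans the chat history for the newest unanswered user
    # message, then streams timestamped commentary chunks from the backend,
    # yielding after each chunk so the chatbot updates incrementally.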
    def gr_for_streaming(history: list[gr.ChatMessage], video_state: dict, state: dict, mode: str, static_trigger: int, dynamic_trigger: int):
        # if static_trigger == 0:
        #     return gr_chatinterface_chatbot_clear_fn()
        # if video_state['video_path'] != state.get('video_path', None):
        #     return gr_chatinterface_chatbot_clear_fn()
        state.update(video_state)
        # scan the history from newest to oldest: take the newest unanswered user
        # message as the query, and remember the assistant's waiting placeholder
        query, assistant_waiting_message = None, None
        for message in history[::-1]:
            if message['role'] == 'user':
                if message['metadata'] is None or message['metadata'].get('status', '') == '':
                    query = message['content']
                    if message['metadata'] is None:
                        message['metadata'] = {}
                    message['metadata']['status'] = 'pending'
                    continue
                if query is not None: # mark earlier user turns as done
                    message['metadata']['status'] = 'done'
            elif message['content'] == GradioBackend.waiting_video_response:
                assistant_waiting_message = message

        for (start_timestamp, stop_timestamp), response, state in gradio_backend(query=query, state=state, mode=mode):
            if start_timestamp >= 0:
                response_with_timestamp = f'{start_timestamp:.1f}s-{stop_timestamp:.1f}s: {response}'
                if assistant_waiting_message is None:
                    history.append(gr.ChatMessage(role="assistant", content=response_with_timestamp))
                else:
                    assistant_waiting_message['content'] = response_with_timestamp
                    assistant_waiting_message = None
            yield history, state, dynamic_trigger
        # flip the trigger so the .change() chain fires again and streaming continues
        yield history, state, 1 - dynamic_trigger
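
    # Browser-side hook: passed as js= below, this runs before the Python fn and
    # reads the <video> element's current source URL and playback time, which
    # arrive in gr_get_video_state as the video_state dict.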
    js_video_timestamp_fetcher = """
        (state, video_state) => {
            const videoEl = document.querySelector("#gr_video video");
            return { video_path: videoEl.currentSrc, video_timestamp: videoEl.currentTime };
        }
    """
    gr_video.change(fn=lambda: [1, 1], outputs=[gr_static_trigger, gr_dynamic_trigger])

    def gr_get_video_state(video_state):
        print(video_state)
        # strip Gradio's file= URL prefix to recover the local file path
        if 'file=' in video_state['video_path']:
            video_state['video_path'] = video_state['video_path'].split('file=')[1]
        return video_state
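
    # Self-refreshing loop: each flip of gr_dynamic_trigger re-runs this chain,
    # which fetches the current playback position and then streams commentary;
    # gr_for_streaming ends by flipping the trigger again to re-arm the chain.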
    gr_dynamic_trigger.change(
        fn=gr_get_video_state,
        inputs=[gr_video_state],
        outputs=[gr_video_state],
        js=js_video_timestamp_fetcher
    ).then(
        fn=gr_for_streaming,
        inputs=[gr_chatinterface.chatbot, gr_video_state, gr_state, gr_radio_mode, gr_static_trigger, gr_dynamic_trigger],
        outputs=[gr_chatinterface.chatbot, gr_state, gr_dynamic_trigger],
    )
demo.queue(max_size=5, default_concurrency_limit=5)
demo.launch(share=True)
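
# The block below is a disabled prototype for streaming TTS narration of the
# commentary through the Kokoro pipeline. Re-enabling it would also need
# `import numpy as np` and a thread-safe `gradio_backend.contents` queue,
# which the GradioBackend above does not define.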
# --- for streaming ---
# gr_tts = gr.Audio(visible=False, elem_id="gr_tts", streaming=True, autoplay=True)
# def tts():
#     while True:
#         contents = ''
#         while not gradio_backend.contents.empty():
#             content = gradio_backend.contents.get()
#             contents += ' ' + content.rstrip(' ...')
#         contents = contents.strip()
#         if contents:
#             generator = gradio_backend.audio_pipeline(contents, voice='af_heart', speed=1.2)
#             for _, _, audio_torch in generator:
#                 audio_np = audio_torch.cpu().numpy()
#                 max_val = np.max(np.abs(audio_np))
#                 if max_val > 0:
#                     audio_np = audio_np / max_val
#                 audio_int16 = (audio_np * 32767).astype(np.int16)
#                 yield (24000, audio_int16)
# gr_video.change(fn=tts, outputs=[gr_tts])