import spaces, os
import torch
import gradio as gr
from kokoro import KPipeline
from qwen_vl_utils import process_vision_info

from demo.infer import LiveCCDemoInfer

model_path = 'chenjoya/LiveCC-7B-Instruct'

def _init_infer():
    # create a singleton LiveCCDemoInfer inside GPU
    import torch
    from kokoro import KPipeline
    from demo.infer import LiveCCDemoInfer
    infer = LiveCCDemoInfer(model_path, device='cuda')
    return infer

# Module-global placeholder; the model is created lazily inside the first
# @spaces.GPU-decorated request.
infer = None
            
with gr.Blocks() as demo:
    gr.Markdown("## LiveCC Conversation and Real-Time Commentary - Gradio Demo")
    gr.Markdown("### [LiveCC: Learning Video LLM with Streaming Speech Transcription at Scale (CVPR 2025)](https://showlab.github.io/livecc/)")
    gr.Markdown("1️⃣ Select Mode, Real-Time Commentary (LiveCC) or Conversation (Common QA)")
    gr.Markdown("2️⃣🅰️ Real-Time Commentary:  Input a query (optional) -> Click or upload a video.")
    gr.Markdown("2️⃣🅱️ Conversation: Click or upload a video -> Input a query.")
    gr.Markdown("*Web Gradio has unexpected latency (3s~5s). If you want to enjoy the very real-time experience, please deploy locally https://github.com/showlab/livecc*")
    gr_state = gr.State({}, render=False) # control all useful state, including kv cache
    gr_video_state = gr.JSON({}, visible=False) # only record video state, belong to gr_state but lightweight
    gr_static_trigger = gr.Number(value=0, visible=False) # control start streaming or stop
    gr_dynamic_trigger = gr.Number(value=0, visible=False) # for continuous refresh 
    
    with gr.Row():
        with gr.Column():
            gr_video = gr.Video(
                label="video",
                elem_id="gr_video",
                visible=True,
                sources=['upload'],
                autoplay=True,
                include_audio=False,
                width=720,
                height=480
            )
            gr_examples = gr.Examples(
                examples=[
                    'demo/sources/howto_fix_laptop_mute_1080p.mp4',
                    'demo/sources/writing_mute_1080p.mp4'
                ],
                inputs=[gr_video],
            )
            gr_clean_button = gr.Button("Clean (press this before switching videos)", elem_id="gr_button")

        with gr.Column():
            with gr.Row():
                gr_radio_mode = gr.Radio(label="Select Mode", choices=["Real-Time Commentary", "Conversation"], elem_id="gr_radio_mode", value='Real-Time Commentary', interactive=True) 

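            # Conversation mode: each chat turn runs through this @spaces.GPU handler.
            # Real-Time Commentary mode is instead driven by gr_for_streaming further below.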
            @spaces.GPU
            def gr_chatinterface_fn(message, history, state, video_path, mode):
                global infer
                yield '(initializing model, thanks for waiting...)', state
                if infer is None:
                    infer = _init_infer()
                state['video_path'] = video_path
                yield '(finished initialization, responding...)', state
                if mode != 'Conversation':
                    # Commentary mode is handled by the streaming loop below; just leave a placeholder.
                    yield 'waiting video input...', state
                    return
                query = message
                if video_path:
                    message = {
                        "role": "user",
                        "content": [
                            {"type": "video", "video": video_path},
                            {"type": "text", "text": query},
                        ],
                    }
                    
                else:
                    message = {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": query},
                        ],
                    }
                image_inputs, video_inputs = process_vision_info([message])
                texts = infer.processor.apply_chat_template([message], tokenize=False, add_generation_prompt=True, return_tensors='pt')
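                # Multi-turn continuation: if a previous turn is cached, close it with
                # <|im_end|> and only feed the new turn (skipping the repeated system prompt).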
                past_ids = state.get('past_ids', None)
                if past_ids is not None:
                    texts = '<|im_end|>\n' + texts[infer.system_prompt_offset:]
                inputs = infer.processor(
                    text=texts,
                    images=image_inputs,
                    videos=video_inputs,
                    return_tensors="pt",
                )
                inputs = inputs.to(infer.model.device)
                if past_ids is not None:
                    inputs['input_ids'] = torch.cat([past_ids, inputs.input_ids], dim=1)
                outputs = infer.model.generate(
                    **inputs, past_key_values=state.get('past_key_values', None), 
                    return_dict_in_generate=True, do_sample=False, 
                    repetition_penalty=1.05,
                    max_new_tokens=512,
                )
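                # Cache kv states and the token ids seen so far, so the next turn only
                # needs to encode its new tokens.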
                state['past_key_values'] = outputs.past_key_values
                state['past_ids'] = outputs.sequences[:, :-1]
                response = infer.processor.decode(outputs.sequences[0, inputs.input_ids.size(1):], skip_special_tokens=True)
                print(response)
                yield response, state
                
            def gr_chatinterface_chatbot_clear_fn():
                return {}, {}, 0, 0
            gr_chatinterface = gr.ChatInterface(
                fn=gr_chatinterface_fn,
                type="messages", 
                additional_inputs=[gr_state, gr_video, gr_radio_mode],
                additional_outputs=[gr_state],
            )
            gr_chatinterface.chatbot.clear(fn=gr_chatinterface_chatbot_clear_fn, outputs=[gr_video_state, gr_state, gr_static_trigger, gr_dynamic_trigger])
            gr_clean_button.click(fn=lambda: [[], *gr_chatinterface_chatbot_clear_fn()], outputs=[gr_chatinterface.chatbot, gr_video_state, gr_state, gr_static_trigger, gr_dynamic_trigger])
            
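            # NOTE: `gradio_backend` / `GradioBackend` below are assumed to come from the full
            # LiveCC demo (a streaming wrapper around LiveCCDemoInfer); they are not defined
            # or imported in this file.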
            def gr_for_streaming(history: list[gr.ChatMessage], video_state: dict, state: dict, mode: str, static_trigger: int, dynamic_trigger: int): 
                # if static_trigger == 0:
                #     return gr_chatinterface_chatbot_clear_fn()
                # if video_state['video_path'] != state.get('video_path', None):
                #     return gr_chatinterface_chatbot_clear_fn()
                state.update(video_state)
                query, assistant_waiting_message = None, None
                for message in history[::-1]:
                    if message['role'] == 'user':
                        if message['metadata'] is None or message['metadata'].get('status', '') == '':
                            query = message['content']
                            if message['metadata'] is None:
                                message['metadata'] = {}
                            message['metadata']['status'] = 'pending'
                            continue
                        if query is not None: # put others as done
                            message['metadata']['status'] = 'done'
                    elif message['content'] == GradioBackend.waiting_video_response:
                        assistant_waiting_message = message
                
                for (start_timestamp, stop_timestamp), response, state in gradio_backend(query=query, state=state, mode=mode):
                    if start_timestamp >= 0:
                        response_with_timestamp = f'{start_timestamp:.1f}s-{stop_timestamp:.1f}s: {response}'
                        if assistant_waiting_message is None:
                            history.append(gr.ChatMessage(role="assistant", content=response_with_timestamp))
                        else:
                            assistant_waiting_message['content'] = response_with_timestamp
                            assistant_waiting_message = None
                        yield history, state, dynamic_trigger
                yield history, state, 1 - dynamic_trigger
            
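            # Browser-side hook: whenever gr_dynamic_trigger flips, read the <video> element's
            # current source and playback position and pass them to gr_get_video_state.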
            js_video_timestamp_fetcher = """
                (state, video_state) => {
                    const videoEl = document.querySelector("#gr_video video");
                    return { video_path: videoEl.currentSrc, video_timestamp: videoEl.currentTime };
                }
            """

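            # Gradio serves uploaded videos via ".../file=<local path>" URLs; recover the
            # filesystem path for the backend.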
            def gr_get_video_state(video_state):
                if 'file=' in video_state['video_path']:
                    video_state['video_path'] = video_state['video_path'].split('file=')[1]
                return video_state
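            # Selecting a video in Real-Time Commentary mode arms both triggers (1, 1), which
            # starts the streaming loop; Conversation mode leaves them off (0, 0).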
            def gr_video_change_fn(mode):
                return [1, 1] if mode == "Real-Time Commentary" else [0, 0]
            gr_video.change(
                fn=gr_video_change_fn, 
                inputs=[gr_radio_mode], 
                outputs=[gr_static_trigger, gr_dynamic_trigger]
            )
            gr_dynamic_trigger.change(
                fn=gr_get_video_state,
                inputs=[gr_video_state],
                outputs=[gr_video_state],
                js=js_video_timestamp_fetcher
            ).then(
                fn=gr_for_streaming, 
                inputs=[gr_chatinterface.chatbot, gr_video_state, gr_state, gr_radio_mode, gr_static_trigger, gr_dynamic_trigger], 
                outputs=[gr_chatinterface.chatbot, gr_state, gr_dynamic_trigger], 
            )
            
    demo.queue(max_size=5, default_concurrency_limit=5)
    demo.launch(share=True)


    # --- for streaming ---

    # gr_tts = gr.Audio(visible=False, elem_id="gr_tts", streaming=True, autoplay=True)
    # def tts():
    #     while True:
    #         contents = ''
    #         while not gradio_backend.contents.empty():
    #             content = gradio_backend.contents.get()
    #             contents += ' ' + content.rstrip(' ...')
    #         contents = contents.strip()
    #         if contents:
    #             generator = gradio_backend.audio_pipeline(contents, voice='af_heart', speed=1.2)
    #             for _, _, audio_torch in generator:
    #                 audio_np = audio_torch.cpu().numpy()
    #                 max_val = np.max(np.abs(audio_np))
    #                 if max_val > 0:
    #                     audio_np = audio_np / max_val
    #                 audio_int16 = (audio_np * 32767).astype(np.int16)
    #                 yield (24000, audio_int16)
    # gr_video.change(fn=tts, outputs=[gr_tts])