import os
import spaces
import torch  # needed below for kv-cache id concatenation
import gradio as gr
from kokoro import KPipeline
from qwen_vl_utils import process_vision_info
from demo.infer import LiveCCDemoInfer

model_path = 'chenjoya/LiveCC-7B-Instruct'
def _init_infer():
    # Lazily build a singleton LiveCCDemoInfer on the GPU worker; the model is
    # too heavy to construct at import time on a ZeroGPU Space.
    return LiveCCDemoInfer(model_path, device='cuda')

# Module-global placeholder for the lazily initialized model
infer = None
# Placeholder text shown while Real-Time Commentary waits for video frames
# (in the LiveCC repo this constant lives on GradioBackend in demo/app.py)
waiting_video_response = 'waiting video input...'
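# On a ZeroGPU Space, CUDA is only available inside handlers decorated with
# @spaces.GPU (assumed deployment detail), which is why the model is created
# lazily on the first request instead of at import time.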
with gr.Blocks() as demo:
    gr.Markdown("## LiveCC Conversation and Real-Time Commentary - Gradio Demo")
    gr.Markdown("### [LiveCC: Learning Video LLM with Streaming Speech Transcription at Scale (CVPR 2025)](https://showlab.github.io/livecc/)")
    gr.Markdown("1️⃣ Select a mode: Real-Time Commentary (LiveCC) or Conversation (common QA)")
    gr.Markdown("2️⃣🅰️ Real-Time Commentary: input a query (optional) -> click or upload a video.")
    gr.Markdown("2️⃣🅱️ Conversation: click or upload a video -> input a query.")
    gr.Markdown("*The hosted Gradio demo has extra latency (3~5 s). For a truly real-time experience, please deploy locally: https://github.com/showlab/livecc*")
    gr_state = gr.State({}, render=False)  # holds all mutable session state, including the kv cache
    gr_video_state = gr.JSON({}, visible=False)  # lightweight video-only state (path + timestamp), kept separate from gr_state
    gr_static_trigger = gr.Number(value=0, visible=False)  # gates whether streaming runs at all
    gr_dynamic_trigger = gr.Number(value=0, visible=False)  # toggled each cycle for continuous refresh
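    # Trigger pattern: gr_static_trigger switches streaming on/off, while
    # gr_dynamic_trigger is flipped 0 <-> 1 after each streaming pass so that
    # its .change event keeps re-firing the commentary loop below.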
    with gr.Row():
        with gr.Column():
            gr_video = gr.Video(
                label="video",
                elem_id="gr_video",
                visible=True,
                sources=['upload'],
                autoplay=True,
                include_audio=False,
                width=720,
                height=480,
            )
            gr_examples = gr.Examples(
                examples=[
                    'demo/sources/howto_fix_laptop_mute_1080p.mp4',
                    'demo/sources/writing_mute_1080p.mp4',
                ],
                inputs=[gr_video],
            )
            gr_clean_button = gr.Button("Clean (press me before changing video)", elem_id="gr_button")
        with gr.Column():
            with gr.Row():
                gr_radio_mode = gr.Radio(label="Select Mode", choices=["Real-Time Commentary", "Conversation"], elem_id="gr_radio_mode", value='Real-Time Commentary', interactive=True)
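            # Conversation mode is served by gr_chatinterface_fn below; Real-Time
            # Commentary is driven by the trigger / streaming loop further down.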
            @spaces.GPU  # assumed ZeroGPU decorator: this handler needs CUDA for generation
            def gr_chatinterface_fn(message, history, state, video_path, mode):
                global infer
                yield '(initializing model, thanks for waiting...)', state
                if infer is None:
                    infer = _init_infer()
                state['video_path'] = video_path
                yield '(finished initialization, responding...)', state
                if mode != 'Conversation':
                    # Commentary responses come from the streaming loop, not this handler
                    yield waiting_video_response, state
                    return
                query = message
                if video_path:
                    message = {
                        "role": "user",
                        "content": [
                            {"type": "video", "video": video_path},
                            {"type": "text", "text": query},
                        ],
                    }
                else:
                    message = {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": query},
                        ],
                    }
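                # The dict above follows the Qwen2-VL multimodal chat schema that
                # qwen_vl_utils.process_vision_info consumes to load video frames.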
                image_inputs, video_inputs = process_vision_info([message])
                texts = infer.processor.apply_chat_template([message], tokenize=False, add_generation_prompt=True, return_tensors='pt')
                past_ids = state.get('past_ids', None)
                if past_ids is not None:
                    # Follow-up turn: close the previous assistant turn and drop the
                    # system prompt, which is already present in the cached ids
                    texts = '<|im_end|>\n' + texts[infer.system_prompt_offset:]
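                # Example of the continuation string (assumed Qwen2-VL chat template):
                # a follow-up turn is fed as
                #   '<|im_end|>\n<|im_start|>user\n...<|im_end|>\n<|im_start|>assistant\n'
                # so past_ids + the newly tokenized ids form one contiguous stream.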
                inputs = infer.processor(
                    text=texts,
                    images=image_inputs,
                    videos=video_inputs,
                    return_tensors="pt",
                )
                inputs = inputs.to(infer.model.device)
                if past_ids is not None:
                    # Prepend the cached ids so positions line up with past_key_values,
                    # and keep the attention mask aligned with the concatenated ids
                    inputs['input_ids'] = torch.cat([past_ids, inputs.input_ids], dim=1)
                    inputs['attention_mask'] = torch.cat([torch.ones_like(past_ids), inputs.attention_mask], dim=1)
                outputs = infer.model.generate(
                    **inputs, past_key_values=state.get('past_key_values', None),
                    return_dict_in_generate=True, do_sample=False,
                    repetition_penalty=1.05,
                    max_new_tokens=512,
                )
                state['past_key_values'] = outputs.past_key_values
                # Cache all ids except the trailing turn terminator, which is
                # re-appended textually ('<|im_end|>\n') on the next turn
                state['past_ids'] = outputs.sequences[:, :-1]
                response = infer.processor.decode(outputs.sequences[0, inputs.input_ids.size(1):], skip_special_tokens=True)
                print(response)
                yield response, state  # return the updated state so the kv cache survives across turns
            def gr_chatinterface_chatbot_clear_fn():
                return {}, {}, 0, 0

            gr_chatinterface = gr.ChatInterface(
                fn=gr_chatinterface_fn,
                type="messages",
                additional_inputs=[gr_state, gr_video, gr_radio_mode],
                additional_outputs=[gr_state],
            )
            gr_chatinterface.chatbot.clear(fn=gr_chatinterface_chatbot_clear_fn, outputs=[gr_video_state, gr_state, gr_static_trigger, gr_dynamic_trigger])
            # The clean button also empties the chatbot itself, hence the extra [] output
            gr_clean_button.click(fn=lambda: [[], *gr_chatinterface_chatbot_clear_fn()], outputs=[gr_chatinterface.chatbot, gr_video_state, gr_state, gr_static_trigger, gr_dynamic_trigger])
            def gr_for_streaming(history: list[gr.ChatMessage], video_state: dict, state: dict, mode: str, static_trigger: int, dynamic_trigger: int):
                # if static_trigger == 0:
                #     return gr_chatinterface_chatbot_clear_fn()
                # if video_state['video_path'] != state.get('video_path', None):
                #     return gr_chatinterface_chatbot_clear_fn()
                state.update(video_state)
                # Find the newest user query and the assistant "waiting" placeholder
                query, assistant_waiting_message = None, None
                for message in history[::-1]:
                    if message['role'] == 'user':
                        if message['metadata'] is None or message['metadata'].get('status', '') == '':
                            query = message['content']
                            if message['metadata'] is None:
                                message['metadata'] = {}
                            message['metadata']['status'] = 'pending'
                            continue
                        if query is not None:  # mark earlier messages as done
                            message['metadata']['status'] = 'done'
                    elif message['content'] == waiting_video_response:
                        assistant_waiting_message = message
                # gradio_backend wraps LiveCCDemoInfer for streaming commentary; it comes
                # from demo/app.py in the LiveCC repo and is assumed to be in scope here
                for (start_timestamp, stop_timestamp), response, state in gradio_backend(query=query, state=state, mode=mode):
                    if start_timestamp >= 0:
                        response_with_timestamp = f'{start_timestamp:.1f}s-{stop_timestamp:.1f}s: {response}'
                        if assistant_waiting_message is None:
                            history.append(gr.ChatMessage(role="assistant", content=response_with_timestamp))
                        else:
                            assistant_waiting_message['content'] = response_with_timestamp
                            assistant_waiting_message = None
                    yield history, state, dynamic_trigger
                # Flip the trigger so its .change handler fires again and the loop re-arms
                yield history, state, 1 - dynamic_trigger
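            # Event chain for commentary mode:
            #   video change -> triggers set to 1 -> JS reads <video> playback time ->
            #   gr_get_video_state normalizes the path -> gr_for_streaming yields text ->
            #   gr_dynamic_trigger flips -> the chain re-fires.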
            js_video_timestamp_fetcher = """
                (video_state) => {
                    const videoEl = document.querySelector("#gr_video video");
                    return { video_path: videoEl.currentSrc, video_timestamp: videoEl.currentTime };
                }
            """
            def gr_get_video_state(video_state):
                # Gradio serves uploads via .../file=<local path>; recover the local path
                if 'file=' in video_state['video_path']:
                    video_state['video_path'] = video_state['video_path'].split('file=')[1]
                return video_state

            def gr_video_change_fn(mode):
                # Arm both triggers in commentary mode; keep them off for plain QA
                return [1, 1] if mode == "Real-Time Commentary" else [0, 0]
            gr_video.change(
                fn=gr_video_change_fn,
                inputs=[gr_radio_mode],
                outputs=[gr_static_trigger, gr_dynamic_trigger]
            )
            gr_dynamic_trigger.change(
                fn=gr_get_video_state,
                inputs=[gr_video_state],
                outputs=[gr_video_state],
                js=js_video_timestamp_fetcher
            ).then(
                fn=gr_for_streaming,
                inputs=[gr_chatinterface.chatbot, gr_video_state, gr_state, gr_radio_mode, gr_static_trigger, gr_dynamic_trigger],
                outputs=[gr_chatinterface.chatbot, gr_state, gr_dynamic_trigger],
            )
demo.queue(max_size=5, default_concurrency_limit=5)
demo.launch(share=True)
# --- optional TTS streaming (disabled) ---
# Enabling this would also require `import numpy as np` and a gradio_backend
# exposing a thread-safe `contents` queue plus a Kokoro `audio_pipeline`.
# gr_tts = gr.Audio(visible=False, elem_id="gr_tts", streaming=True, autoplay=True)
# def tts():
#     while True:
#         contents = ''
#         while not gradio_backend.contents.empty():
#             content = gradio_backend.contents.get()
#             contents += ' ' + content.rstrip(' ...')
#         contents = contents.strip()
#         if contents:
#             generator = gradio_backend.audio_pipeline(contents, voice='af_heart', speed=1.2)
#             for _, _, audio_torch in generator:
#                 audio_np = audio_torch.cpu().numpy()
#                 max_val = np.max(np.abs(audio_np))
#                 if max_val > 0:
#                     audio_np = audio_np / max_val  # normalize to [-1, 1]
#                 audio_int16 = (audio_np * 32767).astype(np.int16)
#                 yield (24000, audio_int16)  # Kokoro outputs 24 kHz mono
# gr_video.change(fn=tts, outputs=[gr_tts])
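# Minimal sketch of the Kokoro side, assuming the kokoro package's KPipeline API
# ('a' = American English, voices such as 'af_heart'):
#
#   import numpy as np
#   from kokoro import KPipeline
#   audio_pipeline = KPipeline(lang_code='a')
#   for _, _, audio in audio_pipeline("Hello from LiveCC", voice='af_heart', speed=1.2):
#       pcm = (audio.cpu().numpy() * 32767).astype(np.int16)  # 24 kHz mono PCM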