# vicuna-clip / app.py
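# Gradio demo that chains four models: Whisper (ASR) transcribes microphone or uploaded
# audio, InstructBLIP optionally captions an image prompt, Vicuna-7B v1.5 answers in a
# child-friendly tutor persona, and an ESPnet VITS model synthesizes the spoken reply.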
import spaces
import torch
import gradio as gr
from transformers import pipeline, AutoModel, LlamaTokenizer, LlamaForCausalLM, InstructBlipForConditionalGeneration, InstructBlipProcessor
import numpy as np
#import yaml
#import os
import requests
import nltk
import scipy.io.wavfile
import os
import subprocess
from huggingface_hub import hf_hub_download
from PIL import Image
subprocess.run(['bash','llama.sh'])
from llama_cpp import Llama
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # use os.environ (not os.putenv) so the setting is also visible through os.environ
from espnet2.bin.tts_inference import Text2Speech
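# The repo_id/filename/cache_dir below and the quoted block that follows are an
# alternative, currently disabled llama.cpp (GGML) backend; the app instead loads the
# full lmsys/vicuna-7b-v1.5 checkpoint in _preload_and_load_models().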
repo_id = "Sosaka/Vicuna-7B-4bit-ggml"
filename = "vicuna-7B-1.1-ggml_q4_0-ggjt_v3.bin"
cache_dir = os.path.expanduser("~/.cache/huggingface/hub")  # expand "~"; it is not expanded automatically when passed as a plain path string
#hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=cache_dir)
'''
llm = Llama(
model_path="~/.cache/huggingface/hub/vicuna-7B-1.1-ggml_q4_0-ggjt_v3.bin",
n_gpu_layers=-1, # Uncomment to use GPU acceleration
# seed=1337, # Uncomment to set a specific seed
n_ctx=4096, # Uncomment to increase the context window
)
llm = Llama.from_pretrained(
repo_id="Sosaka/Vicuna-7B-4bit-ggml",
filename="vicuna-7B-1.1-ggml_q4_0-ggjt_v3.bin",
n_gpu_layers=-1, # Uncomment to use GPU acceleration
n_ctx = 4096,
verbose=False
)
'''
try:
nltk.data.find('taggers/averaged_perceptron_tagger_eng')
except LookupError:
nltk.download('averaged_perceptron_tagger_eng')
try:
nltk.data.find('corpora/cmudict') # Check for cmudict
except LookupError:
nltk.download('cmudict')
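# The NLTK resources above (POS tagger and CMU pronouncing dictionary) appear to be
# required by the English g2p frontend used by the ESPnet text-to-speech model.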
ASR_MODEL_NAME = "openai/whisper-medium.en"
def _preload_and_load_models():
global vicuna_tokenizer, vicuna_model, tts, model5, processor5, asr_pipe
#VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B" # Or another model
#VICUNA_MODEL_NAME = "lmsys/vicuna-13b-v1.5" # Or another model
VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5" # Or another model
vicuna_tokenizer = LlamaTokenizer.from_pretrained(VICUNA_MODEL_NAME)
vicuna_model = LlamaForCausalLM.from_pretrained(
VICUNA_MODEL_NAME,
trust_remote_code=True,
torch_dtype=torch.float16,
        # device_map="cuda",  # or .to('cuda')
).to(torch.device('cuda'),torch.float16) # Explicitly move to CUDA after loading
tts = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits",device='cuda')
model5 = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b").to(torch.device('cuda'),torch.float16)
processor5 = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
asr_pipe = pipeline(
task="automatic-speech-recognition",
model=ASR_MODEL_NAME,
chunk_length_s=30,
device='cuda' if torch.cuda.is_available() else 'cpu', # Use GPU if available
)
_preload_and_load_models()
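# Models are loaded once at startup; the @spaces.GPU decorator below requests GPU time
# for each call when the app runs on a ZeroGPU Space.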
all_special_ids = asr_pipe.tokenizer.all_special_ids
transcribe_token_id = all_special_ids[-5]
translate_token_id = all_special_ids[-6]
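# The task-token ids above are looked up by position in Whisper's special-token list
# (a pattern borrowed from the Whisper ASR demos); this indexing is fragile if the
# tokenizer's special tokens ever change order.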
cap_prompt = (
"Describe this image with a caption to be used for question answering."
)
@spaces.GPU(required=True)
def process_audio(img, microphone, audio_upload, state, answer_mode): # Added audio_upload
audio_source = None
if microphone:
audio_source = microphone
asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id ]]
text = asr_pipe(audio_source)["text"]
    elif audio_upload:
        audio_source = audio_upload
        rate, data = scipy.io.wavfile.read(audio_source)
        if data.ndim > 1:
            data = data.mean(axis=1)  # downmix to mono for the ASR pipeline
        data = data.astype(np.float32)
        if np.abs(data).max() > 1.0:
            data /= np.abs(data).max()  # scale integer PCM to roughly [-1, 1]
        asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id]]
        text = asr_pipe({"sampling_rate": int(rate), "raw": data})["text"]  # keep the sample rate; it was previously discarded
else:
return state, state, None # No audio input
system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
You answer questions clearly and simply, using age-appropriate language.
You are also a little bit silly and like to make jokes."""
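    # Note: this plain "User:"-style prompt is simpler than Vicuna v1.5's native
    # "USER: ... ASSISTANT:" chat template; following the official template may yield
    # better-structured answers.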
prompt = f"{system_prompt}\nUser: {text}"
if img is not None:
        sd_image_a = Image.open(img.name).convert('RGB')
        sd_image_a = sd_image_a.resize((512, 512), Image.LANCZOS)  # resize returns a new image; assign it, and do it before preprocessing
        inputsa = processor5(images=sd_image_a, text=cap_prompt, return_tensors="pt").to(torch.device('cuda'), torch.float16)  # cast floating inputs to match the fp16 model
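        # The three answer_mode presets below trade speed for numeric precision on the GPU:
        # 'slow' disables TF32 and reduced-precision reductions ("highest" matmul precision),
        # 'medium' enables TF32, and 'fast' additionally allows fp16/bf16 reduced-precision
        # reductions. The same flags are re-applied further down before the Vicuna generation.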
if answer_mode == 'slow':
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("highest")
if answer_mode == 'medium':
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False
torch.set_float32_matmul_precision("high")
if answer_mode == 'fast':
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# torch.backends.cuda.preferred_blas_library="cublas"
# torch.backends.cuda.preferred_linalg_library="cusolver"
torch.set_float32_matmul_precision("medium")
with torch.no_grad():
generated_ids = model5.generate(
**inputsa,
do_sample=True,
num_beams=1,
max_length=96,
min_length=48,
top_p=0.9,
repetition_penalty=1.0,
length_penalty=2.0,
temperature=0.5,
)
generated_text = processor5.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
print(generated_text)
prompt = f"{system_prompt}\nImage: {generated_text}\nUser: {text}"
with torch.no_grad():
vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to(torch.device('cuda'))
if answer_mode == 'slow':
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("highest")
vicuna_output = vicuna_model.generate(
**vicuna_input,
max_new_tokens = 512,
min_new_tokens = 256,
do_sample = True,
low_memory = False
)
'''
vicuna_output = llm(
**vicuna_input,
max_tokens=96, # Generate up to 32 tokens, set to None to generate up to the end of the context window
stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
echo=True # Echo the prompt back in the output
)
'''
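        # The quoted llm(...) call above is the inference counterpart of the disabled
        # llama.cpp backend near the top of the file; it appears to be kept for reference only.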
if answer_mode == 'medium':
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False
torch.set_float32_matmul_precision("high")
vicuna_output = vicuna_model.generate(
**vicuna_input,
            max_new_tokens = 192,  # count only generated tokens; max_length also counted the (already long) prompt
min_new_tokens = 64,
do_sample = True,
low_memory = False
)
if answer_mode == 'fast':
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# torch.backends.cuda.preferred_blas_library="cublas"
# torch.backends.cuda.preferred_linalg_library="cusolver"
torch.set_float32_matmul_precision("medium")
with torch.no_grad():
vicuna_output = vicuna_model.generate(
**vicuna_input,
max_new_tokens = 96,
min_new_tokens = 16,
do_sample = True,
low_memory = True
)
vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
vicuna_response = vicuna_response.replace(prompt, "").strip()
updated_state = state + "\nUser: " + text + "\n" + "Tutor: " + vicuna_response
try:
with torch.no_grad():
output = tts(vicuna_response)
wav = output["wav"]
sr = tts.fs
audio_arr = wav.cpu().numpy()
SAMPLE_RATE = sr
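        # Peak-normalize to [-1, 1]; gr.Audio(type="numpy") expects a (sample_rate, waveform) tuple.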
audio_arr = audio_arr / np.abs(audio_arr).max()
audio_output = (SAMPLE_RATE, audio_arr)
#sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE) # Removed writing to file
except requests.exceptions.RequestException as e:
print(f"Error in Hugging Face API request: {e}")
audio_output = None
except Exception as e:
print(f"Error in speech synthesis: {e}")
audio_output = None
return updated_state, updated_state, audio_output
with gr.Blocks(title="Whisper, Vicuna, & TTS Demo") as demo: # Updated title
gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Hugging Face TTS")
gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
with gr.Tab("Transcribe & Synthesize"):
with gr.Row(): # Added a row for better layout
image = gr.File(label="Image Prompt (Optional)")
mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here", elem_id="mic_audio")
audio_upload = gr.Audio(sources="upload", type="filepath", label="Or Upload Audio File") # Added upload component
transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
audio_output = gr.Audio(label="Synthesized Speech", type="numpy", autoplay=True)
answer_mode = gr.Radio(["fast", "medium", "slow"], value='medium')
transcription_state = gr.State(value="")
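        # transcription_state accumulates the running User/Tutor transcript across turns and is
        # fed back into process_audio so each reply is appended to the previous conversation text.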
mic_input.change(
fn=process_audio,
inputs=[image, mic_input, audio_upload, transcription_state, answer_mode], # Include audio_upload
outputs=[transcription_output, transcription_state, audio_output]
)
audio_upload.change( # Added change event for upload
fn=process_audio,
inputs=[image, mic_input, audio_upload, transcription_state, answer_mode], # Include audio_upload
outputs=[transcription_output, transcription_state, audio_output],
api_name='/api/predict'
)
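    # A hypothetical client-side call via gradio_client (names, URL, and file handling are
    # illustrative and depend on the gradio_client version; assumes the Space is reachable
    # at the given address):
    # from gradio_client import Client
    # client = Client("http://127.0.0.1:7860")
    # out_text, out_state, out_audio = client.predict(
    #     None, None, "question.wav", "", "fast", api_name="/api/predict"
    # )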
if __name__ == '__main__':
demo.launch(share=False)