# vicuna-clip / app.py
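# Gradio demo that chains four models: Whisper (ASR) transcribes microphone or uploaded
# audio, InstructBLIP optionally captions an image prompt, Vicuna-7B v1.5 answers in a
# child-friendly tutor persona, and an ESPnet VITS model synthesizes the spoken reply.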
import spaces
import torch
import gradio as gr
from transformers import pipeline, AutoModel, LlamaTokenizer, LlamaForCausalLM, InstructBlipForConditionalGeneration, InstructBlipProcessor
import numpy as np
#import yaml
#import os
import requests
import nltk
import scipy.io.wavfile
import os
import subprocess
from huggingface_hub import hf_hub_download
from PIL import Image
subprocess.run(['bash','llama.sh'])
from llama_cpp import Llama
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # use os.environ (not os.putenv) so the setting is also visible through os.environ
from espnet2.bin.tts_inference import Text2Speech
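# The repo_id/filename/cache_dir below and the quoted block that follows are an
# alternative, currently disabled llama.cpp (GGML) backend; the app instead loads the
# full lmsys/vicuna-7b-v1.5 checkpoint in _preload_and_load_models().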
repo_id = "Sosaka/Vicuna-7B-4bit-ggml"
filename = "vicuna-7B-1.1-ggml_q4_0-ggjt_v3.bin"
cache_dir = os.path.expanduser("~/.cache/huggingface/hub")  # expand "~"; it is not expanded automatically when passed as a plain path string
#hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=cache_dir)
'''
llm = Llama(
model_path="~/.cache/huggingface/hub/vicuna-7B-1.1-ggml_q4_0-ggjt_v3.bin",
n_gpu_layers=-1, # Uncomment to use GPU acceleration
# seed=1337, # Uncomment to set a specific seed
n_ctx=4096, # Uncomment to increase the context window
)
llm = Llama.from_pretrained(
repo_id="Sosaka/Vicuna-7B-4bit-ggml",
filename="vicuna-7B-1.1-ggml_q4_0-ggjt_v3.bin",
n_gpu_layers=-1, # Uncomment to use GPU acceleration
n_ctx = 4096,
verbose=False
)
'''
try:
nltk.data.find('taggers/averaged_perceptron_tagger_eng')
except LookupError:
nltk.download('averaged_perceptron_tagger_eng')
try:
nltk.data.find('corpora/cmudict') # Check for cmudict
except LookupError:
nltk.download('cmudict')
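# The NLTK resources above (POS tagger and CMU pronouncing dictionary) appear to be
# required by the English g2p frontend used by the ESPnet text-to-speech model.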
ASR_MODEL_NAME = "openai/whisper-medium.en"
def _preload_and_load_models():
global vicuna_tokenizer, vicuna_model, tts, model5, processor5, asr_pipe
#VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B" # Or another model
#VICUNA_MODEL_NAME = "lmsys/vicuna-13b-v1.5" # Or another model
VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5" # Or another model
vicuna_tokenizer = LlamaTokenizer.from_pretrained(VICUNA_MODEL_NAME)
vicuna_model = LlamaForCausalLM.from_pretrained(
VICUNA_MODEL_NAME,
trust_remote_code=True,
torch_dtype=torch.float16,
        # device_map="cuda",  # or .to('cuda')
).to(torch.device('cuda'),torch.float16) # Explicitly move to CUDA after loading
tts = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits",device='cuda')
model5 = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b").to(torch.device('cuda'),torch.float16)
processor5 = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
asr_pipe = pipeline(
task="automatic-speech-recognition",
model=ASR_MODEL_NAME,
chunk_length_s=30,
device='cuda' if torch.cuda.is_available() else 'cpu', # Use GPU if available
)
_preload_and_load_models()
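# Models are loaded once at startup; the @spaces.GPU decorator below requests GPU time
# for each call when the app runs on a ZeroGPU Space.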
all_special_ids = asr_pipe.tokenizer.all_special_ids
transcribe_token_id = all_special_ids[-5]
translate_token_id = all_special_ids[-6]
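# The task-token ids above are looked up by position in Whisper's special-token list
# (a pattern borrowed from the Whisper ASR demos); this indexing is fragile if the
# tokenizer's special tokens ever change order.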
cap_prompt = (
"Describe this image with a caption to be used for question answering."
)
@spaces.GPU(required=True)
def process_audio(img, microphone, audio_upload, state, answer_mode): # Added audio_upload
audio_source = None
if microphone:
audio_source = microphone
asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id ]]
text = asr_pipe(audio_source)["text"]
    elif audio_upload:
        audio_source = audio_upload
        rate, data = scipy.io.wavfile.read(audio_source)
        if data.ndim > 1:
            data = data.mean(axis=1)  # downmix to mono for the ASR pipeline
        data = data.astype(np.float32)
        if np.abs(data).max() > 1.0:
            data /= np.abs(data).max()  # scale integer PCM to roughly [-1, 1]
        asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id]]
        text = asr_pipe({"sampling_rate": int(rate), "raw": data})["text"]  # keep the sample rate; it was previously discarded
else:
return state, state, None # No audio input
system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
You answer questions clearly and simply, using age-appropriate language.
You are also a little bit silly and like to make jokes."""
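    # Note: this plain "User:"-style prompt is simpler than Vicuna v1.5's native
    # "USER: ... ASSISTANT:" chat template; following the official template may yield
    # better-structured answers.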
prompt = f"{system_prompt}\nUser: {text}"
if img is not None:
        sd_image_a = Image.open(img.name).convert('RGB')
        sd_image_a = sd_image_a.resize((512, 512), Image.LANCZOS)  # resize returns a new image; assign it, and do it before preprocessing
        inputsa = processor5(images=sd_image_a, text=cap_prompt, return_tensors="pt").to(torch.device('cuda'), torch.float16)  # cast floating inputs to match the fp16 model
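        # The three answer_mode presets below trade speed for numeric precision on the GPU:
        # 'slow' disables TF32 and reduced-precision reductions ("highest" matmul precision),
        # 'medium' enables TF32, and 'fast' additionally allows fp16/bf16 reduced-precision
        # reductions. The same flags are re-applied further down before the Vicuna generation.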
if answer_mode == 'slow':
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("highest")
if answer_mode == 'medium':
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False
torch.set_float32_matmul_precision("high")
if answer_mode == 'fast':
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# torch.backends.cuda.preferred_blas_library="cublas"
# torch.backends.cuda.preferred_linalg_library="cusolver"
torch.set_float32_matmul_precision("medium")
with torch.no_grad():
generated_ids = model5.generate(
**inputsa,
do_sample=True,
num_beams=1,
max_length=96,
min_length=48,
top_p=0.9,
repetition_penalty=1.0,
length_penalty=2.0,
temperature=0.5,
)
generated_text = processor5.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
print(generated_text)
prompt = f"{system_prompt}\nImage: {generated_text}\nUser: {text}"
with torch.no_grad():
vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to(torch.device('cuda'))
if answer_mode == 'slow':
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("highest")
vicuna_output = vicuna_model.generate(
**vicuna_input,
max_new_tokens = 512,
min_new_tokens = 256,
do_sample = True,
low_memory = False
)
'''
vicuna_output = llm(
**vicuna_input,
max_tokens=96, # Generate up to 32 tokens, set to None to generate up to the end of the context window
stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
echo=True # Echo the prompt back in the output
)
'''
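        # The quoted llm(...) call above is the inference counterpart of the disabled
        # llama.cpp backend near the top of the file; it appears to be kept for reference only.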
if answer_mode == 'medium':
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False
torch.set_float32_matmul_precision("high")
vicuna_output = vicuna_model.generate(
**vicuna_input,
            max_new_tokens = 192,  # count only generated tokens; max_length also counted the (already long) prompt
min_new_tokens = 64,
do_sample = True,
low_memory = False
)
if answer_mode == 'fast':
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# torch.backends.cuda.preferred_blas_library="cublas"
# torch.backends.cuda.preferred_linalg_library="cusolver"
torch.set_float32_matmul_precision("medium")
with torch.no_grad():
vicuna_output = vicuna_model.generate(
**vicuna_input,
max_new_tokens = 96,
min_new_tokens = 16,
do_sample = True,
low_memory = True
)
vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
vicuna_response = vicuna_response.replace(prompt, "").strip()
updated_state = state + "\nUser: " + text + "\n" + "Tutor: " + vicuna_response
try:
with torch.no_grad():
output = tts(vicuna_response)
wav = output["wav"]
sr = tts.fs
audio_arr = wav.cpu().numpy()
SAMPLE_RATE = sr
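        # Peak-normalize to [-1, 1]; gr.Audio(type="numpy") expects a (sample_rate, waveform) tuple.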
audio_arr = audio_arr / np.abs(audio_arr).max()
audio_output = (SAMPLE_RATE, audio_arr)
#sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE) # Removed writing to file
except requests.exceptions.RequestException as e:
print(f"Error in Hugging Face API request: {e}")
audio_output = None
except Exception as e:
print(f"Error in speech synthesis: {e}")
audio_output = None
return updated_state, updated_state, audio_output
with gr.Blocks(title="Whisper, Vicuna, & TTS Demo") as demo: # Updated title
gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Hugging Face TTS")
gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
with gr.Tab("Transcribe & Synthesize"):
with gr.Row(): # Added a row for better layout
image = gr.File(label="Image Prompt (Optional)")
mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here", elem_id="mic_audio")
audio_upload = gr.Audio(sources="upload", type="filepath", label="Or Upload Audio File") # Added upload component
transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
audio_output = gr.Audio(label="Synthesized Speech", type="numpy", autoplay=True)
answer_mode = gr.Radio(["fast", "medium", "slow"], value='medium')
transcription_state = gr.State(value="")
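        # transcription_state accumulates the running User/Tutor transcript across turns and is
        # fed back into process_audio so each reply is appended to the previous conversation text.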
mic_input.change(
fn=process_audio,
inputs=[image, mic_input, audio_upload, transcription_state, answer_mode], # Include audio_upload
outputs=[transcription_output, transcription_state, audio_output]
)
audio_upload.change( # Added change event for upload
fn=process_audio,
inputs=[image, mic_input, audio_upload, transcription_state, answer_mode], # Include audio_upload
outputs=[transcription_output, transcription_state, audio_output],
api_name='/api/predict'
)
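    # A hypothetical client-side call via gradio_client (names, URL, and file handling are
    # illustrative and depend on the gradio_client version; assumes the Space is reachable
    # at the given address):
    # from gradio_client import Client
    # client = Client("http://127.0.0.1:7860")
    # out_text, out_state, out_audio = client.predict(
    #     None, None, "question.wav", "", "fast", api_name="/api/predict"
    # )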
if __name__ == '__main__':
demo.launch(share=False)