import spaces
import gradio as gr
import torch
import os
from transformers import (
    pipeline,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    AutoModelForCausalLM,
    AutoProcessor,
)
from gtts import gTTS
from langdetect import detect
import subprocess
from io import BytesIO

# Install flash-attn (skip the CUDA build step so the install succeeds at Space startup)
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)

# Disable CUDA initialization at import time; @spaces.GPU attaches a GPU per call on ZeroGPU
os.environ['CUDA_VISIBLE_DEVICES'] = ''
torch.set_grad_enabled(False)
print("CUDA initialization disabled at import")


@spaces.GPU
def load_whisper():
    try:
        processor = WhisperProcessor.from_pretrained("openai/whisper-small")
        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
        return processor, model
    except Exception as e:
        print(f"Error loading Whisper model: {e}")
        return None, None


@spaces.GPU
def load_vision_model():
    try:
        model_id = "microsoft/Phi-3.5-vision-instruct"
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            trust_remote_code=True,
            torch_dtype=torch.float16
        )
        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=16)
        return model, processor
    except Exception as e:
        print(f"Error loading vision model: {e}")
        return None, None


@spaces.GPU
def load_sarvam():
    try:
        # pipeline() takes the task as its first positional argument; the checkpoint goes in model=
        return pipeline('text-generation', model='sarvamai/sarvam-2b-v0.5')
    except Exception as e:
        print(f"Error loading Sarvam model: {e}")
        return None


@spaces.GPU
def process_audio(audio_path, whisper_processor, whisper_model):
    import librosa
    try:
        # Whisper expects 16 kHz mono audio
        audio, sr = librosa.load(audio_path, sr=16000)
        input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features
        predicted_ids = whisper_model.generate(input_features)
        transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        return transcription
    except Exception as e:
        return f"Error processing audio: {str(e)}"


@spaces.GPU
def process_image(image, text_prompt, vision_model, vision_processor):
    try:
        messages = [{"role": "user", "content": f"{text_prompt}\n<|image_1|>"}]
        prompt = vision_processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = vision_processor(prompt, image, return_tensors="pt")
        generate_ids = vision_model.generate(**inputs, max_new_tokens=1000, temperature=0.2, do_sample=True)
        # Drop the prompt tokens so only the newly generated answer is decoded
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        return response
    except Exception as e:
        return f"Error processing image: {str(e)}"


@spaces.GPU
def generate_response(transcription, sarvam_pipe):
    try:
        response = sarvam_pipe(transcription, max_length=100, num_return_sequences=1)[0]['generated_text']
        return response
    except Exception as e:
        return f"Error generating response: {str(e)}"


def text_to_speech(text, lang='hi'):
    try:
        tts = gTTS(text=text, lang=lang, tld='co.in')
        tts.save("response.mp3")
        return "response.mp3"
    except Exception as e:
        print(f"Error in text-to-speech: {str(e)}")
        return None


@spaces.GPU
def indic_vision_assistant(input_type, audio_input, text_input, image_input):
    try:
        whisper_processor, whisper_model = load_whisper()
        vision_model, vision_processor = load_vision_model()
        sarvam_pipe = load_sarvam()

        if input_type == "audio" and audio_input is not None:
            transcription = process_audio(audio_input, whisper_processor, whisper_model)
        elif input_type == "text" and text_input:
            transcription = text_input
        elif input_type == "image" and image_input is not None:
            text_prompt = text_input if text_input else "Describe this image in detail."
            transcription = process_image(image_input, text_prompt, vision_model, vision_processor)
        else:
            return "Please provide either audio, text, or image input.", "No input provided.", None

        response = generate_response(transcription, sarvam_pipe)
        # Detect the response language so gTTS speaks it back in the same language
        lang = detect(response)
        audio_response = text_to_speech(response, lang)
        return transcription, response, audio_response
    except Exception as e:
        error_message = f"An error occurred: {str(e)}"
        return error_message, error_message, None


# Custom CSS
custom_css = """
body {
    background-color: #0b0f19;
    color: #e2e8f0;
    font-family: 'Arial', sans-serif;
}
#custom-header {
    text-align: center;
    padding: 20px 0;
    background-color: #1a202c;
    margin-bottom: 20px;
    border-radius: 10px;
}
#custom-header h1 {
    font-size: 2.5rem;
    margin-bottom: 0.5rem;
}
#custom-header h1 .blue {
    color: #60a5fa;
}
#custom-header h1 .pink {
    color: #f472b6;
}
#custom-header h2 {
    font-size: 1.5rem;
    color: #94a3b8;
}
.suggestions {
    display: flex;
    justify-content: center;
    flex-wrap: wrap;
    gap: 1rem;
    margin: 20px 0;
}
.suggestion {
    background-color: #1e293b;
    border-radius: 0.5rem;
    padding: 1rem;
    display: flex;
    align-items: center;
    transition: transform 0.3s ease;
    width: 200px;
}
.suggestion:hover {
    transform: translateY(-5px);
}
.suggestion-icon {
    font-size: 1.5rem;
    margin-right: 1rem;
    background-color: #2d3748;
    padding: 0.5rem;
    border-radius: 50%;
}
.gradio-container {
    max-width: 100% !important;
}
#component-0, #component-1, #component-2 {
    max-width: 100% !important;
}
footer {
    text-align: center;
    margin-top: 2rem;
    color: #64748b;
}
"""

# Custom HTML for the header
custom_header = """
<div id="custom-header">
    <h1><span class="blue">Hello,</span> <span class="pink">User</span></h1>
    <h2>How can I help you today?</h2>
</div>
""" # Custom HTML for suggestions custom_suggestions = """
<div class="suggestions">
    <div class="suggestion">
        <span class="suggestion-icon">🎤</span>
        <p>Speak in any Indic language</p>
    </div>
    <div class="suggestion">
        <span class="suggestion-icon">⌨️</span>
        <p>Type in any Indic language</p>
    </div>
    <div class="suggestion">
        <span class="suggestion-icon">🖼️</span>
        <p>Upload an image for analysis</p>
    </div>
    <div class="suggestion">
        <span class="suggestion-icon">🤖</span>
        <p>Get AI-generated responses</p>
    </div>
    <div class="suggestion">
        <span class="suggestion-icon">🔊</span>
        <p>Listen to audio responses</p>
    </div>
</div>
""" # Gradio interface with gr.Blocks(css=custom_css, theme=gr.themes.Base().set( body_background_fill="#0b0f19", body_text_color="#e2e8f0", button_primary_background_fill="#3b82f6", button_primary_background_fill_hover="#2563eb", button_primary_text_color="white", block_title_text_color="#94a3b8", block_label_text_color="#94a3b8", )) as iface: gr.HTML(custom_header) gr.HTML(custom_suggestions) with gr.Row(): with gr.Column(scale=1): gr.Markdown("### Indic Vision Assistant") input_type = gr.Radio(["audio", "text", "image"], label="Input Type", value="audio") audio_input = gr.Audio(type="filepath", label="Speak (if audio input selected)") text_input = gr.Textbox(label="Type your message or image prompt") image_input = gr.Image(type="pil", label="Upload an image (if image input selected)") submit_btn = gr.Button("Submit") output_transcription = gr.Textbox(label="Transcription/Input") output_response = gr.Textbox(label="Generated Response") output_audio = gr.Audio(label="Audio Response") submit_btn.click( fn=indic_vision_assistant, inputs=[input_type, audio_input, text_input, image_input], outputs=[output_transcription, output_response, output_audio] ) gr.HTML("") # Launch the app iface.launch()