import spaces
import gradio as gr
import torch
import os
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration, AutoModelForCausalLM, AutoProcessor
from gtts import gTTS
from langdetect import detect
import subprocess
from io import BytesIO

# Install flash-attn
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

# Disable CUDA initialization at import
os.environ['CUDA_VISIBLE_DEVICES'] = ''
torch.set_grad_enabled(False)
print("CUDA initialization disabled at import")


@spaces.GPU
def load_whisper():
    """Load the Whisper speech-recognition model and its processor."""
    try:
        processor = WhisperProcessor.from_pretrained("openai/whisper-small")
        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
        return processor, model
    except Exception as e:
        print(f"Error loading Whisper model: {e}")
        return None, None


@spaces.GPU
def load_vision_model():
    """Load the Phi-3.5 vision-instruct model and its processor."""
    try:
        model_id = "microsoft/Phi-3.5-vision-instruct"
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            trust_remote_code=True,
            torch_dtype=torch.float16
        )
        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=16)
        return model, processor
    except Exception as e:
        print(f"Error loading vision model: {e}")
        return None, None


@spaces.GPU
def load_sarvam():
    """Load the Sarvam text-generation pipeline."""
    try:
        # pipeline() takes the task name first; the model is passed via the `model` argument.
        return pipeline('text-generation', model='sarvamai/sarvam-2b-v0.5')
    except Exception as e:
        print(f"Error loading Sarvam model: {e}")
        return None


@spaces.GPU
def process_audio(audio_path, whisper_processor, whisper_model):
    """Transcribe an audio file with Whisper."""
    import librosa
    try:
        # Resample to 16 kHz, the sampling rate Whisper expects.
        audio, sr = librosa.load(audio_path, sr=16000)
        input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features
        predicted_ids = whisper_model.generate(input_features)
        transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        return transcription
    except Exception as e:
        return f"Error processing audio: {str(e)}"


@spaces.GPU
def process_image(image, text_prompt, vision_model, vision_processor):
    """Answer a text prompt about an image with the Phi-3.5 vision model."""
    try:
        messages = [{"role": "user", "content": f"{text_prompt}\n<|image_1|>"}]
        prompt = vision_processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = vision_processor(prompt, image, return_tensors="pt")
        generate_ids = vision_model.generate(**inputs, max_new_tokens=1000, temperature=0.2, do_sample=True)
        # Drop the prompt tokens so only the newly generated answer is decoded.
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        return response
    except Exception as e:
        return f"Error processing image: {str(e)}"


@spaces.GPU
def generate_response(transcription, sarvam_pipe):
    """Generate a text response from the Sarvam pipeline."""
    try:
        response = sarvam_pipe(transcription, max_length=100, num_return_sequences=1)[0]['generated_text']
        return response
    except Exception as e:
        return f"Error generating response: {str(e)}"


def text_to_speech(text, lang='hi'):
    """Convert the response text to speech with gTTS and save it as an MP3."""
    try:
        tts = gTTS(text=text, lang=lang, tld='co.in')
        tts.save("response.mp3")
        return "response.mp3"
    except Exception as e:
        print(f"Error in text-to-speech: {str(e)}")
        return None


@spaces.GPU
def indic_vision_assistant(input_type, audio_input, text_input, image_input):
    """Route the user's input (audio, text, or image) through the pipeline and return
    the transcription/prompt, the generated response, and an audio version of it."""
    try:
        whisper_processor, whisper_model = load_whisper()
        vision_model, vision_processor = load_vision_model()
        sarvam_pipe = load_sarvam()

        if input_type == "audio" and audio_input is not None:
            transcription = process_audio(audio_input, whisper_processor, whisper_model)
        elif input_type == "text" and text_input:
            transcription = text_input
        elif input_type == "image" and image_input is not None:
            text_prompt = text_input if text_input else "Describe this image in detail."
            transcription = process_image(image_input, text_prompt, vision_model, vision_processor)
        else:
            return "Please provide either audio, text, or image input.", "No input provided.", None

        response = generate_response(transcription, sarvam_pipe)
        lang = detect(response)
        audio_response = text_to_speech(response, lang)
        return transcription, response, audio_response
    except Exception as e:
        error_message = f"An error occurred: {str(e)}"
        return error_message, error_message, None


# Custom CSS
custom_css = """
body { background-color: #0b0f19; color: #e2e8f0; font-family: 'Arial', sans-serif; }
#custom-header { text-align: center; padding: 20px 0; background-color: #1a202c; margin-bottom: 20px; border-radius: 10px; }
#custom-header h1 { font-size: 2.5rem; margin-bottom: 0.5rem; }
#custom-header h1 .blue { color: #60a5fa; }
#custom-header h1 .pink { color: #f472b6; }
#custom-header h2 { font-size: 1.5rem; color: #94a3b8; }
.suggestions { display: flex; justify-content: center; flex-wrap: wrap; gap: 1rem; margin: 20px 0; }
.suggestion { background-color: #1e293b; border-radius: 0.5rem; padding: 1rem; display: flex; align-items: center; transition: transform 0.3s ease; width: 200px; }
.suggestion:hover { transform: translateY(-5px); }
.suggestion-icon { font-size: 1.5rem; margin-right: 1rem; background-color: #2d3748; padding: 0.5rem; border-radius: 50%; }
.gradio-container { max-width: 100% !important; }
#component-0, #component-1, #component-2 { max-width: 100% !important; }
footer { text-align: center; margin-top: 2rem; color: #64748b; }
"""

# Custom HTML for the header
custom_header = """
<div class="suggestions">
    <div class="suggestion">Speak in any Indic language</div>
    <div class="suggestion">Type in any Indic language</div>
    <div class="suggestion">Upload an image for analysis</div>
    <div class="suggestion">Get AI-generated responses</div>
    <div class="suggestion">Listen to audio responses</div>
</div>
"""
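
# Illustrative sketch (assumption): one possible way to wire the functions above into a
# gr.Blocks interface using custom_css and custom_header. The component names, layout,
# and launch call below are hypothetical and not the original interface code.
with gr.Blocks(css=custom_css) as iface:
    gr.HTML(custom_header)
    input_type = gr.Radio(["audio", "text", "image"], label="Input Type", value="text")
    audio_input = gr.Audio(type="filepath", label="Speak")
    text_input = gr.Textbox(label="Type your message or image prompt")
    image_input = gr.Image(type="pil", label="Upload an image")
    submit_btn = gr.Button("Submit")
    transcription_output = gr.Textbox(label="Transcription / Prompt")
    response_output = gr.Textbox(label="Response")
    audio_output = gr.Audio(label="Audio Response")

    # indic_vision_assistant returns (transcription, response, audio file path).
    submit_btn.click(
        indic_vision_assistant,
        inputs=[input_type, audio_input, text_input, image_input],
        outputs=[transcription_output, response_output, audio_output],
    )

if __name__ == "__main__":
    iface.launch()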