"""Gradio demo: answer a question about an image with LLaVA plus retrieved text.

Documents under ``docs/`` are embedded and indexed with LlamaIndex at import
time.  For every request the question is run through the index's query engine
and the result is handed to LLaVA, together with the uploaded image, as
additional context.
"""

import os

import gradio as gr
import torch
from llama_index import (
    ServiceContext,
    SimpleDirectoryReader,
    VectorStoreIndex,
    set_global_service_context,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.storage.storage_context import StorageContext
from llama_index.vector_stores.faiss import FaissVectorStore
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Load LLaVA model and processor.
# NOTE(review): the weights are loaded in float16 even when the model ends up
# on the CPU; confirm the installed PyTorch supports half precision there.
model_id = "llava-hf/llava-1.5-7b-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Load documents and build the vector index.
# NOTE(review): FaissVectorStore / StorageContext are imported but never passed
# to the index, so this uses LlamaIndex's default vector store rather than
# FAISS.  Wiring FAISS in needs a `faiss` index object, which this file does
# not import.
# NOTE(review): no `llm` is passed to ServiceContext, so the query engine uses
# LlamaIndex's default LLM to synthesize its response -- confirm that is
# intended, since the UI text describes plain retrieval.
documents = SimpleDirectoryReader("docs").load_data()
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")
service_context = ServiceContext.from_defaults(embed_model=embed_model)
set_global_service_context(service_context)
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)
query_engine = index.as_query_engine()


def multimodal_rag(image, question):
    """Answer ``question`` about ``image`` using indexed text as context.

    Args:
        image: The uploaded picture as a PIL image, or ``None`` when the user
            submitted the form without one.
        question: The user's question as typed into the textbox.

    Returns:
        The text generated by LLaVA (prompt excluded), or a short message
        asking for the missing input when the image or question is absent.
    """
    # Gradio passes None for an empty image slot; the processor cannot handle
    # that, so answer with a readable message instead of crashing.
    if image is None:
        return "Please upload an image."
    question = (question or "").strip()
    if not question:
        return "Please enter a question."

    # Step 1: RAG to retrieve context.
    context = str(query_engine.query(question)).strip()

    # Step 2: Process with LLaVA.
    # LLaVA-1.5 expects the "USER: <image>\n... ASSISTANT:" template; the
    # <image> placeholder marks where the image features are inserted.
    prompt = (
        "USER: <image>\n"
        f"Context: {context}\n"
        f"Question: {question} ASSISTANT:"
    )
    # Keyword arguments keep the call independent of the processor's
    # positional order; casting to model.dtype makes the pixel values match
    # the half-precision weights (integer token ids are left untouched).
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(
        model.device, model.dtype
    )
    output = model.generate(**inputs, max_new_tokens=100)

    # generate() returns prompt tokens followed by the new ones; decode only
    # the continuation so the answer does not echo the prompt and context.
    prompt_length = inputs["input_ids"].shape[1]
    answer = processor.decode(
        output[0][prompt_length:],
        skip_special_tokens=True,
    )
    return answer.strip()


demo = gr.Interface(
    fn=multimodal_rag,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Enter your question"),
    ],
    outputs="text",
    title="Multimodal RAG with LLaVA and FAISS",
    description="Upload an image and ask a question. The system retrieves relevant text using FAISS and answers using LLaVA.",
)

if __name__ == "__main__":
    demo.launch()