"""Gradio demo: a retrieval-augmented legal assistant.

At import time this script loads a slice of the LexGLUE "scotus" training
split, embeds it with a sentence-transformers model, indexes the embeddings
in a flat L2 FAISS index, loads a text2text generation pipeline, and
launches a Gradio text-in/text-out interface.
"""

import faiss
import gradio as gr
import numpy as np
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Number of training documents to index; kept small to keep the demo light.
CORPUS_SIZE = 200
# Default number of nearest documents retrieved per question.
TOP_K = 3
# Value passed to the generation pipeline as ``max_length``.
MAX_ANSWER_LENGTH = 250

# Load dataset
dataset = load_dataset("lex_glue", "scotus")
corpus = [doc["text"] for doc in dataset["train"].select(range(CORPUS_SIZE))]

# Embedding model
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
corpus_embeddings = embedder.encode(corpus, convert_to_numpy=True)

# Build FAISS index
dimension = corpus_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(corpus_embeddings)

# Text generation model
gen_pipeline = pipeline("text2text-generation", model="facebook/bart-large-cnn")


def rag_query(user_question: str, k: int = TOP_K) -> str:
    """Answer a question using the ``k`` nearest corpus documents as context.

    The question is embedded with ``embedder``, the nearest documents are
    looked up in ``index``, and a "Question/Context/Answer" prompt is passed
    to ``gen_pipeline``.

    Args:
        user_question: Free-text question typed by the user.
        k: How many documents to retrieve from the index.

    Returns:
        The generated text, or a short hint when the question is blank.
    """
    if not user_question or not user_question.strip():
        # Nothing to retrieve against; skip the expensive model calls.
        return "Please enter a question."

    # FAISS expects float32 query vectors.
    question_embedding = np.asarray(
        embedder.encode([user_question]), dtype=np.float32
    )
    _, indices = index.search(question_embedding, k)

    # FAISS pads missing neighbours with -1; indexing corpus[-1] would
    # silently pull in the last document, so drop those labels.
    context = " ".join(corpus[i] for i in indices[0] if i >= 0)

    prompt = f"Question: {user_question}\nContext: {context}\nAnswer:"
    # The joined context can be much longer than the generator accepts, so
    # ask the pipeline to truncate the input instead of failing on it.
    outputs = gen_pipeline(
        prompt,
        max_length=MAX_ANSWER_LENGTH,
        do_sample=False,
        truncation=True,
    )
    return outputs[0]["generated_text"]


def chatbot_interface(query: str) -> str:
    """Gradio callback: forward the textbox content to ``rag_query``."""
    return rag_query(query)


# Gradio UI
iface = gr.Interface(
    fn=chatbot_interface,
    inputs="text",
    outputs="text",
    title="🧑‍⚖️ Legal Assistant Chatbot",
    description="Ask legal questions based on case data (LexGLUE - SCOTUS subset)",
)

iface.launch()