File size: 3,389 Bytes
54202cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
import os
import fitz # PyMuPDF for PDF processing
import faiss
import numpy as np
import pickle
import streamlit as st
from sentence_transformers import SentenceTransformer
from groq import Groq
from dotenv import load_dotenv
# Load environment variables (e.g. GROQ_API_KEY) from a local .env file, if any
load_dotenv()
# Initialize Groq Client
# SECURITY: the API key is read from the environment rather than hard-coded in
# source control. The key that was previously committed in this file should be
# treated as leaked and revoked.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# Sentence-embedding model used both when indexing chunks and when querying
_EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(_EMBEDDING_MODEL_NAME)

# On-disk location of the pickled FAISS index
INDEX_FILE = "faiss_index.pkl"
def load_faiss_index(dim=384):
    """Return the FAISS index stored in INDEX_FILE, or a new empty one.

    Args:
        dim: Vector dimensionality used when a fresh ``IndexFlatL2`` has to be
            created. The default of 384 is presumably the output size of the
            embedding model loaded in this file -- confirm if the model
            changes.

    Returns:
        The unpickled index if INDEX_FILE exists, otherwise an empty
        ``faiss.IndexFlatL2(dim)``.
    """
    # Open first and handle the missing-file case, instead of checking for
    # existence and then opening (the file could vanish in between).
    try:
        f = open(INDEX_FILE, "rb")
    except FileNotFoundError:
        return faiss.IndexFlatL2(dim)
    with f:
        # SECURITY: pickle.load can execute arbitrary code. INDEX_FILE must
        # only ever be a file written by this application.
        return pickle.load(f)
index = load_faiss_index()
# Chunk texts, position-aligned with the vectors in `index`
# (documents[i] is the text whose embedding has id i).
documents = []
# Only the index is persisted, never the chunk texts, so vectors restored
# from disk have no matching entry in `documents` and would shift the ids of
# everything added afterwards. Drop them so ids and texts stay aligned.
# NOTE(review): persisting the texts alongside the index would allow the
# saved vectors to be reused instead of discarded.
if index.ntotal != len(documents):
    index.reset()
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        pdf_file: Value passed straight to ``fitz.open`` (the caller in this
            file passes a filesystem path).

    Returns:
        One string containing each page's ``get_text()`` output, with pages
        separated by a newline character.
    """
    # The context manager closes the document (and its underlying file)
    # even if text extraction raises.
    with fitz.open(pdf_file) as doc:
        return "\n".join(page.get_text() for page in doc)
def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    The final chunk may be shorter than ``chunk_size``.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared by neighbouring chunks.

    Returns:
        List of chunk strings; an empty list for empty *text*.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``chunk_size``. The
            window would never advance (previously this produced either an
            opaque ``range()`` error or, silently, no chunks at all).
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError("chunk_size must be greater than overlap")
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
def add_to_faiss(text_chunks):
    """Embed *text_chunks*, add them to the index and save the index to disk.

    The chunks are appended to the module-level ``documents`` list in the
    same order as their vectors are added to ``index``, which is what lets
    a search result id be used as a position in ``documents``.

    Args:
        text_chunks: List of strings to index. An empty list is a no-op.

    Returns:
        None.
    """
    # Nothing to embed (e.g. a PDF without a text layer). Encoding an empty
    # list gives an empty 1-D array, which index.add() cannot accept.
    if not text_chunks:
        return
    embeddings = np.asarray(embedding_model.encode(text_chunks), dtype=np.float32)
    index.add(embeddings)
    documents.extend(text_chunks)
    # Only the index is written out; the chunk texts in `documents` are not.
    with open(INDEX_FILE, "wb") as f:
        pickle.dump(index, f)
def query_faiss(query, top_k=3):
    """Return the stored text chunks closest to *query*.

    Args:
        query: Question text to embed and search for.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` strings from ``documents``, in the order returned by
        the index search. Fewer (possibly zero) are returned when the index
        holds fewer vectors or an id has no matching text.
    """
    query_embedding = np.asarray(embedding_model.encode([query]), dtype=np.float32)
    _, indices = index.search(query_embedding, top_k)
    # FAISS fills missing neighbours with -1. Without the lower bound, -1
    # would pass the length check and silently select the last chunk.
    return [documents[i] for i in indices[0] if 0 <= i < len(documents)]
def query_groq(prompt, model="llama-3.3-70b-versatile"):
    """Send *prompt* to the Groq chat-completions API and return the reply.

    Args:
        prompt: Text sent as a single user message.
        model: Groq model identifier. Defaults to the model this function
            previously had hard-coded.

    Returns:
        The message content of the first completion choice. If the request
        fails for any reason, a string starting with "⚠️ Error:" that
        carries the exception text is returned instead of raising.
    """
    try:
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model,
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        # Deliberately broad: the caller renders whatever string comes back,
        # so any failure is reported as text rather than raised.
        # The warning sign below replaces a mis-encoded byte sequence.
        return f"⚠️ Error: {e}"
# Streamlit UI
# NOTE(review): the emoji in this section were mis-encoded in the reviewed
# copy (one even split a string literal across two lines). The warning,
# upload, check-mark and speech-balloon glyphs could be recovered from the
# surviving bytes; the document/scroll/search/books glyphs are best guesses
# -- confirm against the intended design.
st.set_page_config(page_title="RAG-based PDF Chatbot", page_icon="📄", layout="wide")
st.title("📄 RAG-based PDF Chatbot")
st.markdown("Talk to your PDFs using AI-powered search!")

with st.sidebar:
    st.subheader("📤 Upload a PDF")
    uploaded_file = st.file_uploader("Drag & drop or browse", type="pdf")

if uploaded_file:
    with st.spinner("Processing your PDF..."):
        # extract_text_from_pdf is given a path, so the upload is written
        # to disk first.
        with open("uploaded.pdf", "wb") as f:
            f.write(uploaded_file.getbuffer())
        text = extract_text_from_pdf("uploaded.pdf")
        # A PDF without a text layer (e.g. scanned pages) yields only
        # whitespace; there is nothing meaningful to embed or index.
        has_text = bool(text.strip())
        if has_text:
            text_chunks = chunk_text(text)
            add_to_faiss(text_chunks)
    if has_text:
        st.sidebar.success("✅ PDF uploaded and indexed!")
        with st.expander("📜 Extracted Text Preview", expanded=False):
            st.text(text[:1000] + "...")
    else:
        st.sidebar.warning("⚠️ No extractable text found in this PDF.")

st.markdown("---")
st.subheader("🔍 Ask something about the document")
query = st.text_input("Type your question below:")

if query:
    retrieved_texts = query_faiss(query)
    if retrieved_texts:
        context = "\n".join(retrieved_texts)
        with st.expander("📚 Retrieved Context", expanded=False):
            st.text(context[:1000] + "...")
        # The retrieved chunks are placed in front of the question so the
        # model answers from the document.
        response = query_groq(f"Context:\n{context}\n\nUser Query:\n{query}")
        st.subheader("💬 AI Response")
        st.markdown(f"**{response}**")
    else:
        st.warning("⚠️ No relevant context found in the document!")