|
import streamlit as st |
|
import faiss |
|
import numpy as np |
|
import pickle |
|
from sentence_transformers import SentenceTransformer |
|
from transformers import pipeline |
|
|
|
|
|
# Path to the prebuilt FAISS vector index on disk.
INDEX_FILE = "faiss_index.index"

# Pickled list of text chunks; row i of the FAISS index corresponds to chunks[i].
CHUNKS_FILE = "chunks.pkl"

# Sentence-transformers model used to embed queries.
# NOTE(review): must match the model used to build the index — confirm against
# the indexing script (not visible in this file).
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# Extractive (SQuAD2-style) QA model loaded via the transformers pipeline.
QA_MODEL_NAME = "deepset/roberta-large-squad2"
|
|
|
@st.cache_resource
def load_index_and_chunks():
    """Load the persisted FAISS index and its matching text chunks.

    Returns:
        tuple: (faiss index, list of text chunks), where chunk i corresponds
        to row i of the index.

    Cached by Streamlit so the disk reads happen only once per process.
    """
    faiss_index = faiss.read_index(INDEX_FILE)
    with open(CHUNKS_FILE, "rb") as chunk_file:
        text_chunks = pickle.load(chunk_file)
    return faiss_index, text_chunks
|
|
|
@st.cache_resource
def load_embedding_model():
    """Return the (cached) SentenceTransformer used to embed user queries."""
    model = SentenceTransformer(EMBEDDING_MODEL_NAME)
    return model
|
|
|
@st.cache_resource
def load_qa_pipeline():
    """Build the extractive question-answering pipeline once and reuse it
    across Streamlit reruns."""
    return pipeline(
        "question-answering",
        model=QA_MODEL_NAME,
        tokenizer=QA_MODEL_NAME,
    )
|
|
|
def main():
    """Streamlit entry point.

    Embeds the user's question, retrieves the k nearest text chunks from the
    FAISS index, and runs an extractive QA model over the concatenated
    context. All heavyweight resources are cached via @st.cache_resource.
    """
    st.title("PDF Question-Answering App")

    index, chunks = load_index_and_chunks()
    embed_model = load_embedding_model()
    qa_pipeline = load_qa_pipeline()

    st.write("Enter your question about the PDF document:")
    query = st.text_input("Question:")

    if query:
        # FAISS expects a float32 matrix of shape (n_queries, dim).
        query_embedding = embed_model.encode([query]).astype('float32')

        # Retrieve the k nearest chunks for the query.
        k = 3
        distances, indices = index.search(query_embedding, k)

        # FAISS pads the result with -1 when fewer than k vectors are
        # available; the original code let those slip through and silently
        # indexed chunks[-1] (the last chunk). Skip invalid ids, and build
        # the retrieved list once so it can be reused below.
        retrieved = [chunks[idx] for idx in indices[0] if idx >= 0]

        # Single join instead of repeated string concatenation in a loop.
        context = " ".join(retrieved)

        with st.expander("Show Retrieved Context"):
            for piece in retrieved:
                st.write(piece)

        st.subheader("Answer:")
        try:
            result = qa_pipeline(question=query, context=context)
            st.write(result["answer"])
        except Exception as e:
            # Boundary handler: surface model/tokenizer failures in the UI
            # instead of crashing the whole app.
            st.error(f"Error generating answer: {e}")
|
|
|
# Run the app only when this file is executed directly
# (e.g. `streamlit run <file>`), not when imported as a module.
if __name__ == "__main__":

    main()
|
|