# RAG PDF Q&A: retrieve relevant passages from a PDF with FAISS and generate
# an answer with an OpenAI chat model, served through a Gradio interface.
# Standard library
import os
from functools import lru_cache

# Third-party
import faiss
import gradio as gr
import numpy as np
import openai
import PyPDF2
from sentence_transformers import SentenceTransformer
# Read the OpenAI API key from the environment instead of hard-coding it.
# SECURITY: the original committed a live-looking "sk-..." key in source;
# a key published like that is leaked and must be revoked and rotated.
openai.api_key = os.environ.get("OPENAI_API_KEY")
# Function to extract text from PDF
# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Extract per-page text from a PDF.

    Args:
        pdf_path: Filesystem path to the PDF.

    Returns:
        A list of dicts, one per page, each with keys
        "page" (1-based page number) and "text" (extracted text, never None).
    """
    pages = []
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for i, page in enumerate(reader.pages):
            # extract_text() can return None for image-only or empty pages;
            # normalize to "" so downstream string operations don't crash.
            pages.append({"page": i + 1, "text": page.extract_text() or ""})
    return pages
# Function to chunk text into manageable pieces
# Function to chunk text into manageable pieces
def chunk_text(text_data, chunk_size=2000):
    """Split per-page text into chunks of at most ~chunk_size characters.

    Splitting happens on sentence boundaries (". ") so chunks stay readable;
    a single sentence longer than chunk_size still becomes its own chunk.

    Args:
        text_data: List of {"page": int, "text": str} dicts, as produced by
            extract_text_from_pdf.
        chunk_size: Soft upper bound on chunk length, in characters.

    Returns:
        List of {"chunk": str, "page": int} dicts. Pages with no text
        contribute no chunks (the original emitted a bogus "." chunk for
        empty pages, and an empty chunk when the first sentence alone
        exceeded chunk_size).
    """
    chunks = []
    for data in text_data:
        page_num = data["page"]
        # Guard against a missing/None "text" value so .split() can't fail.
        page_text = data.get("text") or ""
        current_chunk = ""
        for sentence in page_text.split(". "):
            if not sentence.strip():
                continue  # skip empty fragments, e.g. from a trailing ". "
            if len(current_chunk) + len(sentence) <= chunk_size:
                current_chunk += sentence + ". "
            else:
                if current_chunk.strip():
                    chunks.append({"chunk": current_chunk.strip(), "page": page_num})
                current_chunk = sentence + ". "
        if current_chunk.strip():
            chunks.append({"chunk": current_chunk.strip(), "page": page_num})
    return chunks
# Function to create a FAISS index from the chunked text
# Function to create a FAISS index from the chunked text
def create_faiss_index(chunks, model_name="all-MiniLM-L6-v2"):
    """Embed the chunks and build an exact-L2 FAISS index over them.

    Args:
        chunks: List of {"chunk": str, "page": int} dicts from chunk_text.
        model_name: SentenceTransformer model to embed with.

    Returns:
        (index, chunks): the populated faiss.IndexFlatL2 and the chunk list
        (row i of the index corresponds to chunks[i]).
    """
    model = SentenceTransformer(model_name)
    embeddings = model.encode([chunk["chunk"] for chunk in chunks])
    # FAISS requires a C-contiguous float32 matrix; coerce defensively in
    # case the encoder returns float64 or a non-contiguous array.
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks
# Function to retrieve relevant chunks from the FAISS index based on a question
# Function to retrieve relevant chunks from the FAISS index based on a question
def retrieve_from_pdf(question, index, chunks, model_name="all-MiniLM-L6-v2"):
    """Retrieve the chunks most similar to the question from the FAISS index.

    Args:
        question: The user's question.
        index: FAISS index whose row i corresponds to chunks[i].
        chunks: List of {"chunk": str, "page": int} dicts.
        model_name: SentenceTransformer model (must match the one used to
            build the index).

    Returns:
        (retrieved_chunks, context): the top matching chunk dicts and their
        texts joined into one context string.
    """
    if not chunks:
        return [], ""
    model = SentenceTransformer(model_name)
    query_embedding = model.encode([question])
    # Never ask FAISS for more neighbours than there are rows: it pads the
    # result with index -1, which the original then used to index `chunks`,
    # silently grabbing chunks[-1]. Also drop any -1 entries defensively.
    k = min(10, len(chunks))
    _, top_k_indices = index.search(query_embedding, k=k)
    retrieved_chunks = [chunks[idx] for idx in top_k_indices[0] if idx >= 0]
    # Debug: Print the retrieved chunks
    print("Retrieved Chunks:")
    for i, chunk in enumerate(retrieved_chunks):
        print(f"Chunk {i + 1}: {chunk['chunk'][:200]}... (Page {chunk['page']})")  # Truncate to first 200 chars
    page_numbers = set(chunk["page"] for chunk in retrieved_chunks)
    print(f"Retrieved page numbers: {page_numbers}")  # Debug: Page numbers
    # NOTE: the original also "filtered" retrieved_chunks by page_numbers,
    # but page_numbers was derived from retrieved_chunks itself, so the
    # filter was a no-op and has been removed.
    context = " ".join(chunk["chunk"] for chunk in retrieved_chunks)
    return retrieved_chunks, context
# GPT function to generate a precise answer using the retrieved context
def gpt_generate_answer(question, context, pages):
    """Ask the chat model to answer `question` from `context`.

    Args:
        question: The user's question.
        context: Concatenated retrieved chunk text.
        pages: Page numbers of the retrieved chunks (duplicates allowed).

    Returns:
        The (answer, relevant_text, relevant_pages) triple produced by
        parse_gpt_response.
    """
    unique_pages = ", ".join(map(str, set(pages)))
    sections = [
        "Answer the following question as precisely and concisely as possible based on the provided context. Also include the page numbers where the relevant text was found. Please respond in English:",
        f"Question: {question}",
        f"Context: {context}",
        f"Pages: {unique_pages}",
        "Please strictly follow this format:\n- **Answer:** [Your answer]\n- **Relevant Text:** [The most relevant portion of the context]\n- **Pages:** [Pages of the Relevant Text]\n",
    ]
    prompt = "\n\n".join(sections)
    print("GPT Prompt:", prompt)  # Debug

    response = openai.ChatCompletion.create(
        model="o1-mini",
        messages=[{"role": "user", "content": prompt}],
    )
    print("GPT Raw Response:", response)  # Debug

    content = response["choices"][0]["message"]["content"]
    print("GPT Content:", content)  # Debug
    return parse_gpt_response(content)
# Function to parse the GPT response
# Function to parse the GPT response
def parse_gpt_response(content):
    """Parse the model reply into (answer, relevant_text, relevant_pages).

    Expects the bold-markdown format requested in the prompt:
        - **Answer:** ...
        - **Relevant Text:** ...
        - **Pages:** ...
    Any section that is missing falls back to a "... not found." placeholder.

    Args:
        content: Raw text content of the model's reply.

    Returns:
        Tuple of three strings: (answer, relevant_text, relevant_pages).
    """
    answer, relevant_text, relevant_pages = None, None, None
    # Test for the exact markers we split on. The original checked the bare
    # labels ("Answer:") but split on the bold markers ("- **Answer:**"),
    # so any reply containing the label without the marker raised IndexError.
    if "- **Answer:**" in content:
        answer = content.split("- **Answer:**")[1].split("- **Relevant Text:**")[0].strip()
    if "- **Relevant Text:**" in content:
        relevant_text = content.split("- **Relevant Text:**")[1].split("- **Pages:**")[0].strip()
    if "- **Pages:**" in content:
        relevant_pages = content.split("- **Pages:**")[1].strip()
    # Ensure missing information is handled
    if not answer:
        print("Warning: 'Answer' was not parsed correctly.")
        answer = "Answer not found."
    if not relevant_text:
        print("Warning: 'Relevant Text' was not parsed correctly.")
        relevant_text = "Relevant Text not found."
    if not relevant_pages:
        print("Warning: 'Pages' was not parsed correctly.")
        relevant_pages = "Pages not found."
    # Debug: Print parsed content
    print("Parsed Answer:", answer)  # Debug
    print("Parsed Relevant Text:", relevant_text)  # Debug
    print("Parsed Relevant Pages:", relevant_pages)  # Debug
    return answer, relevant_text, relevant_pages
# Gradio function to integrate everything into an interactive interface
@lru_cache(maxsize=1)
def _load_pdf_index(pdf_path):
    """Build (index, chunks) for the PDF once per process and cache it.

    The original rebuilt the embeddings and FAISS index on every question,
    re-embedding the entire document per query; caching makes the expensive
    work happen only on the first question.
    """
    chunks = chunk_text(extract_text_from_pdf(pdf_path))
    return create_faiss_index(chunks)


def gradio_rag(question):
    """Answer `question` from the PDF via retrieve-then-generate.

    Returns:
        (answer, relevant_text, relevant_pages) — three strings feeding the
        three Gradio output boxes.
    """
    index, chunk_list = _load_pdf_index("norms_pacing.pdf")
    retrieved_chunks, context = retrieve_from_pdf(question, index, chunk_list)
    if not context.strip():
        answer = "No relevant information found."
        relevant_text = "No relevant text found."
        relevant_pages = "No pages found."
    else:
        pages = [chunk["page"] for chunk in retrieved_chunks]  # Extract relevant pages
        answer, relevant_text, relevant_pages = gpt_generate_answer(question, context, pages)
    print("Final Answer:", answer)  # Debug
    print("Final Relevant Text:", relevant_text)  # Debug
    print("Final Relevant Pages:", relevant_pages)  # Debug
    return answer, relevant_text, relevant_pages
# Gradio interface: one question textbox in, three textboxes out, matching
# the (answer, relevant_text, relevant_pages) triple returned by gradio_rag.
interface = gr.Interface(
fn=gradio_rag,
inputs=gr.Textbox(label="Enter your question"),
outputs=[
gr.Textbox(label="Answer"),
gr.Textbox(label="Relevant Retrieved Text"),
gr.Textbox(label="Pages Retrieved")
],
title="RAG PDF Q&A with GPT",
description="Ask a question, and the system retrieves relevant information from a PDF file and generates a refined answer using GPT.",
)
# Launch the interface (guarded so importing this module has no side effect).
if __name__ == "__main__":
    # Removed a stray trailing "|" artifact that made this line a syntax error.
    # share=True publishes a temporary public gradio.live URL.
    interface.launch(share=True)