"""RAG question-answering over a PDF: PyPDF2 extraction -> sentence chunking ->
SentenceTransformer embeddings -> FAISS retrieval -> OpenAI answer generation,
served through a Gradio interface."""

import os

import PyPDF2
import faiss
import gradio as gr
import openai
from sentence_transformers import SentenceTransformer

# SECURITY: never hard-code API keys in source. A previous revision embedded a
# live key here; treat that key as compromised and rotate it. The key is now
# read from the environment.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# Process-level caches so the embedding model and the per-PDF FAISS index are
# built once, not once per Gradio request (previously every question re-read
# and re-embedded the entire PDF and re-loaded the model twice).
_MODEL_CACHE = {}
_CORPUS_CACHE = {}


def _get_model(model_name):
    """Return a cached SentenceTransformer for *model_name*, loading it once."""
    if model_name not in _MODEL_CACHE:
        _MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _MODEL_CACHE[model_name]


def extract_text_from_pdf(pdf_path):
    """Extract text page by page from the PDF at *pdf_path*.

    Returns a list of ``{"page": <1-based page number>, "text": <page text>}``
    dicts. ``extract_text()`` can return ``None`` for image-only pages; that is
    normalized to an empty string here.
    """
    text = []
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for i, page in enumerate(reader.pages):
            page_text = page.extract_text() or ""  # guard: may be None
            text.append({"page": i + 1, "text": page_text})
    return text


def chunk_text(text_data, chunk_size=2000):
    """Split page texts into sentence-aligned chunks of roughly *chunk_size* chars.

    Each chunk keeps the page number it came from so answers can be attributed
    to pages later. Sentences are approximated by splitting on ". ".
    """
    chunks = []
    for data in text_data:
        page_text = data["text"]
        page_num = data["page"]
        current_chunk = ""
        for sentence in page_text.split(". "):
            if len(current_chunk) + len(sentence) <= chunk_size:
                current_chunk += sentence + ". "
            else:
                chunks.append({"chunk": current_chunk.strip(), "page": page_num})
                current_chunk = sentence + ". "
        if current_chunk:
            chunks.append({"chunk": current_chunk.strip(), "page": page_num})
    return chunks


def create_faiss_index(chunks, model_name="all-MiniLM-L6-v2"):
    """Embed *chunks* and build a flat L2 FAISS index over the embeddings.

    Returns ``(index, chunks)``; the chunk list order matches the index rows,
    so a search hit at row ``i`` corresponds to ``chunks[i]``.
    """
    model = _get_model(model_name)
    embeddings = model.encode([chunk["chunk"] for chunk in chunks])
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks


def retrieve_from_pdf(question, index, chunks, model_name="all-MiniLM-L6-v2"):
    """Return the chunks most similar to *question* plus their joined context.

    Fixes vs. the previous revision:
    - ``k`` is clamped to ``len(chunks)``; FAISS pads missing hits with index
      ``-1``, which previously resolved to ``chunks[-1]`` (the last chunk)
      instead of being skipped.
    - The page-number "filter" was a no-op (it filtered the retrieved chunks
      against the set of their own page numbers, keeping everything) and has
      been removed; the returned chunks are unchanged by this.
    """
    model = _get_model(model_name)
    query_embedding = model.encode([question])
    k = min(10, len(chunks))  # retrieve at most 10 results
    _, top_k_indices = index.search(query_embedding, k=k)
    # Skip the -1 padding entries FAISS emits when fewer than k hits exist.
    retrieved_chunks = [chunks[idx] for idx in top_k_indices[0] if idx >= 0]

    # Debug: preview what was retrieved (first 200 chars per chunk).
    print("Retrieved Chunks:")
    for i, chunk in enumerate(retrieved_chunks):
        print(f"Chunk {i + 1}: {chunk['chunk'][:200]}... (Page {chunk['page']})")
    page_numbers = set(chunk["page"] for chunk in retrieved_chunks)
    print(f"Retrieved page numbers: {page_numbers}")  # Debug: page numbers

    context = " ".join(chunk["chunk"] for chunk in retrieved_chunks)
    return retrieved_chunks, context


def gpt_generate_answer(question, context, pages):
    """Ask the OpenAI chat model for a structured answer grounded in *context*.

    *pages* is the list of page numbers the context chunks came from; it is
    deduplicated and passed to the model so it can cite them. Returns the
    ``(answer, relevant_text, relevant_pages)`` triple from
    :func:`parse_gpt_response`.
    """
    pages_text = ", ".join(map(str, set(pages)))
    prompt = (
        f"Answer the following question as precisely and concisely as possible based on the provided context. "
        f"Also include the page numbers where the relevant text was found. Please respond in English:\n\n"
        f"Question: {question}\n\n"
        f"Context: {context}\n\n"
        f"Pages: {pages_text}\n\n"
        f"Please strictly follow this format:\n"
        f"- **Answer:** [Your answer]\n"
        f"- **Relevant Text:** [The most relevant portion of the context]\n"
        f"- **Pages:** [Pages of the Relevant Text]\n"
    )
    print("GPT Prompt:", prompt)  # Debug: the prompt sent to OpenAI

    # NOTE(review): this uses the legacy openai<1.0 ChatCompletion API; the
    # pinned model name is preserved as-is.
    response = openai.ChatCompletion.create(
        model="o1-mini",
        messages=[{"role": "user", "content": prompt}],
    )
    print("GPT Raw Response:", response)  # Debug
    content = response['choices'][0]['message']['content']
    print("GPT Content:", content)  # Debug

    return parse_gpt_response(content)


def _extract_between(content, start_marker, end_marker=None):
    """Return the text between *start_marker* and *end_marker* (or the end),
    or ``None`` when *start_marker* is absent."""
    start = content.find(start_marker)
    if start == -1:
        return None
    start += len(start_marker)
    if end_marker is not None:
        end = content.find(end_marker, start)
        if end != -1:
            return content[start:end].strip()
    return content[start:].strip()


def parse_gpt_response(content):
    """Parse the model's formatted reply into (answer, relevant_text, pages).

    Fix vs. the previous revision: the membership checks looked for plain
    ``"Answer:"`` while the splits required the bold ``"- **Answer:**"``
    marker, so a reply containing the former but not the latter raised
    ``IndexError``. Extraction is now guarded and each missing section falls
    back to a "not found" placeholder, as before.
    """
    answer = _extract_between(content, "- **Answer:**", "- **Relevant Text:**")
    relevant_text = _extract_between(content, "- **Relevant Text:**", "- **Pages:**")
    relevant_pages = _extract_between(content, "- **Pages:**")

    if not answer:
        print("Warning: 'Answer' was not parsed correctly.")
        answer = "Answer not found."
    if not relevant_text:
        print("Warning: 'Relevant Text' was not parsed correctly.")
        relevant_text = "Relevant Text not found."
    if not relevant_pages:
        print("Warning: 'Pages' was not parsed correctly.")
        relevant_pages = "Pages not found."

    print("Parsed Answer:", answer)  # Debug
    print("Parsed Relevant Text:", relevant_text)  # Debug
    print("Parsed Relevant Pages:", relevant_pages)  # Debug
    return answer, relevant_text, relevant_pages


def _get_corpus(pdf_path):
    """Build (or fetch from cache) the FAISS index and chunk list for *pdf_path*."""
    if pdf_path not in _CORPUS_CACHE:
        text_data = extract_text_from_pdf(pdf_path)
        chunks = chunk_text(text_data)
        _CORPUS_CACHE[pdf_path] = create_faiss_index(chunks)
    return _CORPUS_CACHE[pdf_path]


def gradio_rag(question):
    """Gradio entry point: retrieve context for *question* and generate an answer.

    Fix vs. the previous revision: the PDF was re-extracted, re-chunked and
    re-indexed on every single question; the corpus is now built once per
    process and cached.
    """
    pdf_path = "norms_pacing.pdf"
    index, chunk_list = _get_corpus(pdf_path)
    retrieved_chunks, context = retrieve_from_pdf(question, index, chunk_list)

    if not context.strip():
        answer = "No relevant information found."
        relevant_text = "No relevant text found."
        relevant_pages = "No pages found."
    else:
        pages = [chunk["page"] for chunk in retrieved_chunks]  # relevant pages
        answer, relevant_text, relevant_pages = gpt_generate_answer(question, context, pages)

    print("Final Answer:", answer)  # Debug
    print("Final Relevant Text:", relevant_text)  # Debug
    print("Final Relevant Pages:", relevant_pages)  # Debug
    return answer, relevant_text, relevant_pages


# Gradio interface wiring: one question in, three text panes out.
interface = gr.Interface(
    fn=gradio_rag,
    inputs=gr.Textbox(label="Enter your question"),
    outputs=[
        gr.Textbox(label="Answer"),
        gr.Textbox(label="Relevant Retrieved Text"),
        gr.Textbox(label="Pages Retrieved"),
    ],
    title="RAG PDF Q&A with GPT",
    description="Ask a question, and the system retrieves relevant information from a PDF file and generates a refined answer using GPT.",
)

if __name__ == "__main__":
    # share=True publishes a public Gradio tunnel URL.
    interface.launch(share=True)