# NOTE(review): removed copy/paste scrape artifacts that preceded the code
# (a "File size" banner, commit-hash lines, and a pasted line-number gutter
# 1..172) — they were not Python and made the file unrunnable as saved.
import os

import faiss
import gradio as gr
import openai
import PyPDF2
from sentence_transformers import SentenceTransformer

# SECURITY: the original hard-coded a live-looking OpenAI secret key in source.
# Read it from the environment instead; rotate the leaked key and keep
# credentials out of version control.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Read *pdf_path* and return one dict per page: {"page": n, "text": s}.

    Page numbers are 1-based to match how readers cite PDF pages.
    """
    pages = []
    with open(pdf_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        for page_number, page in enumerate(reader.pages, start=1):
            pages.append({"page": page_number, "text": page.extract_text()})
    return pages

# Function to chunk text into manageable pieces
def chunk_text(text_data, chunk_size=2000):
    """Split per-page text into sentence-aligned chunks of at most ~chunk_size chars.

    Args:
        text_data: list of {"page": int, "text": str} dicts (extract_text_from_pdf output).
        chunk_size: soft upper bound on chunk length in characters.

    Returns:
        List of {"chunk": str, "page": int} dicts. Chunks never span pages.

    Fix over the original: when a single sentence exceeded chunk_size while the
    running buffer was still empty, the original appended an empty-string chunk;
    empty chunks are now skipped before appending.
    """
    chunks = []
    for data in text_data:
        page_text = data["text"]
        page_num = data["page"]
        # Naive sentence split; ". " is re-appended so chunks stay readable.
        sentences = page_text.split(". ")
        current_chunk = ""
        for sentence in sentences:
            if len(current_chunk) + len(sentence) <= chunk_size:
                current_chunk += sentence + ". "
            else:
                stripped = current_chunk.strip()
                if stripped:  # skip empty buffers (oversized first sentence)
                    chunks.append({"chunk": stripped, "page": page_num})
                current_chunk = sentence + ". "
        stripped = current_chunk.strip()
        if stripped:
            chunks.append({"chunk": stripped, "page": page_num})
    return chunks

# Function to create a FAISS index from the chunked text
def create_faiss_index(chunks, model_name="all-MiniLM-L6-v2"):
    """Embed each chunk with a sentence-transformer and load the vectors into
    a flat L2 FAISS index.

    Returns (index, chunks): the chunk list is passed back unchanged so its
    positions stay aligned with the index's row ids.
    """
    encoder = SentenceTransformer(model_name)
    vectors = encoder.encode([entry["chunk"] for entry in chunks])
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index, chunks

# Function to retrieve relevant chunks from the FAISS index based on a question
def retrieve_from_pdf(question, index, chunks, model_name="all-MiniLM-L6-v2"):
    """Return (retrieved_chunks, context) for *question*.

    Embeds the question with the same model family used to build the index,
    looks up the 10 nearest chunks, and joins their text into one context
    string for the GPT prompt.

    Fix over the original: it rebuilt `filtered_chunks` by keeping chunks whose
    page was in `page_numbers`, but `page_numbers` was itself derived from the
    retrieved chunks — a guaranteed no-op — so the dead filter is removed.
    """
    model = SentenceTransformer(model_name)
    query_embedding = model.encode([question])
    _, top_k_indices = index.search(query_embedding, k=10)  # Retrieve top 10 results

    retrieved_chunks = [chunks[idx] for idx in top_k_indices[0]]

    # Debug: Print the retrieved chunks
    print("Retrieved Chunks:")
    for i, chunk in enumerate(retrieved_chunks):
        print(f"Chunk {i + 1}: {chunk['chunk'][:200]}... (Page {chunk['page']})")  # Truncate to first 200 chars

    page_numbers = set(chunk["page"] for chunk in retrieved_chunks)
    print(f"Retrieved page numbers: {page_numbers}")  # Debug: Page numbers

    context = " ".join(chunk["chunk"] for chunk in retrieved_chunks)

    return retrieved_chunks, context

# GPT function to generate a precise answer using the retrieved context
def gpt_generate_answer(question, context, pages):
    """Ask the chat model to answer *question* from *context*, citing *pages*.

    Returns the (answer, relevant_text, relevant_pages) triple produced by
    parse_gpt_response.
    """
    pages_text = ", ".join(map(str, set(pages)))

    prompt = (
        f"Answer the following question as precisely and concisely as possible based on the provided context. "
        f"Also include the page numbers where the relevant text was found. Please respond in English:\n\n"
        f"Question: {question}\n\n"
        f"Context: {context}\n\n"
        f"Pages: {pages_text}\n\n"
        f"Please strictly follow this format:\n"
        f"- **Answer:** [Your answer]\n"
        f"- **Relevant Text:** [The most relevant portion of the context]\n"
        f"- **Pages:** [Pages of the Relevant Text]\n"
    )

    print("GPT Prompt:", prompt)  # Debug: what we send to OpenAI

    response = openai.ChatCompletion.create(
        model="o1-mini",
        messages=[{"role": "user", "content": prompt}],
    )

    print("GPT Raw Response:", response)  # Debug: full API payload

    content = response['choices'][0]['message']['content']
    print("GPT Content:", content)  # Debug

    return parse_gpt_response(content)


# Function to parse the GPT response
def parse_gpt_response(content):
    """Parse the model's formatted reply into (answer, relevant_text, pages).

    Expects the "- **Answer:** ... - **Relevant Text:** ... - **Pages:** ..."
    layout requested in the prompt; any section the model omitted falls back
    to a "... not found." placeholder.

    Fix over the original: the guards tested for loose substrings such as
    "Answer:" but then split on the full "- **Answer:**" marker, so a reply
    containing "Answer:" without the markdown bullet raised IndexError.
    The guards now test the exact markers used for splitting.
    """
    answer, relevant_text, relevant_pages = None, None, None

    if "- **Answer:**" in content:
        answer = content.split("- **Answer:**")[1].split("- **Relevant Text:**")[0].strip()
    if "- **Relevant Text:**" in content:
        relevant_text = content.split("- **Relevant Text:**")[1].split("- **Pages:**")[0].strip()
    if "- **Pages:**" in content:
        relevant_pages = content.split("- **Pages:**")[1].strip()

    # Ensure missing information is handled
    if not answer:
        print("Warning: 'Answer' was not parsed correctly.")
        answer = "Answer not found."
    if not relevant_text:
        print("Warning: 'Relevant Text' was not parsed correctly.")
        relevant_text = "Relevant Text not found."
    if not relevant_pages:
        print("Warning: 'Pages' was not parsed correctly.")
        relevant_pages = "Pages not found."

    # Debug: Print parsed content
    print("Parsed Answer:", answer)  # Debug
    print("Parsed Relevant Text:", relevant_text)  # Debug
    print("Parsed Relevant Pages:", relevant_pages)  # Debug

    return answer, relevant_text, relevant_pages

# Gradio function to integrate everything into an interactive interface

# Module-level cache: the original re-extracted, re-chunked, and re-embedded
# the whole PDF on EVERY question, paying the full indexing cost per query.
# The index for a given path is immutable for the process lifetime, so it is
# built once and reused.
_INDEX_CACHE = {}


def _get_or_build_index(pdf_path):
    """Return the cached (faiss_index, chunk_list) for *pdf_path*, building it on first use."""
    if pdf_path not in _INDEX_CACHE:
        text_data = extract_text_from_pdf(pdf_path)
        chunks = chunk_text(text_data)
        _INDEX_CACHE[pdf_path] = create_faiss_index(chunks)
    return _INDEX_CACHE[pdf_path]


def gradio_rag(question):
    """Gradio callback: answer *question* from the bundled PDF.

    Returns (answer, relevant_text, relevant_pages) strings for the three
    output textboxes.
    """
    pdf_path = "norms_pacing.pdf"
    index, chunk_list = _get_or_build_index(pdf_path)

    retrieved_chunks, context = retrieve_from_pdf(question, index, chunk_list)

    if not context.strip():
        # Nothing retrieved — report that rather than prompting GPT with no context.
        answer = "No relevant information found."
        relevant_text = "No relevant text found."
        relevant_pages = "No pages found."
    else:
        pages = [chunk["page"] for chunk in retrieved_chunks]  # Extract relevant pages
        answer, relevant_text, relevant_pages = gpt_generate_answer(question, context, pages)
        print("Final Answer:", answer)  # Debug
        print("Final Relevant Text:", relevant_text)  # Debug
        print("Final Relevant Pages:", relevant_pages)  # Debug

    return answer, relevant_text, relevant_pages

# Gradio interface
# Wires gradio_rag into a simple web UI: one question textbox in, three
# textboxes out (answer, supporting excerpt, page numbers) — matching the
# 3-tuple that gradio_rag returns.
interface = gr.Interface(
    fn=gradio_rag,
    inputs=gr.Textbox(label="Enter your question"),
    outputs=[
        gr.Textbox(label="Answer"),
        gr.Textbox(label="Relevant Retrieved Text"),
        gr.Textbox(label="Pages Retrieved")
    ],
    title="RAG PDF Q&A with GPT",
    description="Ask a question, and the system retrieves relevant information from a PDF file and generates a refined answer using GPT.",
)

# Launch the interface
if __name__ == "__main__":
    # share=True asks Gradio to expose a temporary public URL in addition
    # to the local server.
    interface.launch(share=True)