Spaces:

zliang
/

PDFReadingAssistant

Paused

App Files Files Community

zliang committed on Feb 9

Commit

e60ba77

verified ·

1 Parent(s): 18ee818

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -0

app.py CHANGED Viewed

@@ -235,6 +235,80 @@ def qa_pdf(_pdf_file_path, query, num_clusters=5):
 # (Keep the rest of the code from previous implementation for PDF processing and UI)
 # [Include the process_pdf, image_to_base64, and Streamlit UI code from previous response]
 # [Make sure to maintain all the UI improvements and error handling]
 # Streamlit UI Configuration
 st.set_page_config(

 # (Keep the rest of the code from previous implementation for PDF processing and UI)
 # [Include the process_pdf, image_to_base64, and Streamlit UI code from previous response]
 # [Make sure to maintain all the UI improvements and error handling]
+@st.cache_data(show_spinner=False, ttl=3600)
+@handle_errors
+def qa_pdf(_pdf_file_path, query, num_clusters=5):
+    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
+    llm = ChatOpenAI(model="gpt-4", api_key=openai_api_key, temperature=0.3)
+    prompt = ChatPromptTemplate.from_template(
+        """Answer this question: {question}
+        Using only this context: {context}
+        Format your answer with:
+        - Clear section headings
+        - Bullet points for lists
+        - Bold key terms
+        - Citations from the text"""
+    )
+    loader = PyMuPDFLoader(_pdf_file_path)
+    docs = loader.load()
+    full_text = "\n".join(doc.page_content for doc in docs)
+    cleaned_full_text = clean_text(remove_references(full_text))
+    text_splitter = SpacyTextSplitter(chunk_size=500)
+    split_contents = text_splitter.split_text(cleaned_full_text)
+    query_embedding = embeddings_model.embed_query(query)
+    similarities = cosine_similarity([query_embedding],
+                                   embeddings_model.embed_documents(split_contents))[0]
+    top_indices = np.argsort(similarities)[-num_clusters:]
+    chain = prompt | llm | StrOutputParser()
+    return chain.invoke({
+        "question": query,
+        "context": ' '.join([split_contents[i] for i in top_indices])
+    })
+@st.cache_data(show_spinner=False, ttl=3600)
+@handle_errors
+def process_pdf(_pdf_file_path):
+    doc = fitz.open(_pdf_file_path)
+    all_figures, all_tables = [], []
+    scale_factor = 300 / 50  # High-res to low-res ratio
+    for page in doc:
+        low_res = page.get_pixmap(dpi=50)
+        low_res_img = np.frombuffer(low_res.samples, dtype=np.uint8).reshape(low_res.height, low_res.width, 3)
+        results = model.predict(low_res_img)
+        boxes = [
+            (int(box.xyxy[0][0]), int(box.xyxy[0][1]),
+             int(box.xyxy[0][2]), int(box.xyxy[0][3]), int(box.cls[0]))
+            for result in results for box in result.boxes
+            if box.conf[0] > 0.8 and int(box.cls[0]) in {3, 4}
+        ]
+        if boxes:
+            high_res = page.get_pixmap(dpi=300)
+            high_res_img = np.frombuffer(high_res.samples, dtype=np.uint8).reshape(high_res.height, high_res.width, 3)
+            for (x1, y1, x2, y2, cls) in boxes:
+                cropped = high_res_img[int(y1*scale_factor):int(y2*scale_factor),
+                                     int(x1*scale_factor):int(x2*scale_factor)]
+                if cls == 4:
+                    all_figures.append(cropped)
+                else:
+                    all_tables.append(cropped)
+    return all_figures, all_tables
+def image_to_base64(img):
+    buffered = io.BytesIO()
+    img = Image.fromarray(img).convert("RGB")
+    img.thumbnail((800, 800))  # Optimize image size
+    img.save(buffered, format="JPEG", quality=85)
+    return base64.b64encode(buffered.getvalue()).decode()
 # Streamlit UI Configuration
 st.set_page_config(