zliang committed on
Commit
e60ba77
·
verified ·
1 Parent(s): 18ee818

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -0
app.py CHANGED
@@ -235,6 +235,80 @@ def qa_pdf(_pdf_file_path, query, num_clusters=5):
235
  # (Keep the rest of the code from previous implementation for PDF processing and UI)
236
  # [Include the process_pdf, image_to_base64, and Streamlit UI code from previous response]
237
  # [Make sure to maintain all the UI improvements and error handling]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
 
239
  # Streamlit UI Configuration
240
  st.set_page_config(
 
235
  # (Keep the rest of the code from previous implementation for PDF processing and UI)
236
  # [Include the process_pdf, image_to_base64, and Streamlit UI code from previous response]
237
  # [Make sure to maintain all the UI improvements and error handling]
238
@st.cache_data(show_spinner=False, ttl=3600)
@handle_errors
def qa_pdf(_pdf_file_path, query, num_clusters=5):
    """Answer ``query`` using the most query-similar text chunks of a PDF.

    The PDF text is loaded, passed through ``remove_references`` and
    ``clean_text``, split into chunks, and each chunk is embedded. The
    ``num_clusters`` chunks with the highest cosine similarity to the query
    embedding are joined into the context handed to the chat model.

    Args:
        _pdf_file_path: Path of the PDF to read.
        query: The user's question.
        num_clusters: Number of highest-similarity chunks to use as context.

    Returns:
        The output of ``prompt | llm | StrOutputParser()`` for the question.

    Raises:
        ValueError: If no text chunks could be extracted from the PDF.
            (How ``handle_errors`` surfaces this is defined elsewhere.)
    """
    # NOTE(review): Streamlit does not hash parameters whose names start with
    # an underscore, so the cache key here is only (query, num_clusters).
    # The same question asked about a different PDF within the TTL may return
    # the cached answer for the earlier file -- confirm this is intended.
    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
    llm = ChatOpenAI(model="gpt-4", api_key=openai_api_key, temperature=0.3)

    prompt = ChatPromptTemplate.from_template(
        """Answer this question: {question}
Using only this context: {context}
Format your answer with:
- Clear section headings
- Bullet points for lists
- Bold key terms
- Citations from the text"""
    )

    loader = PyMuPDFLoader(_pdf_file_path)
    docs = loader.load()
    full_text = "\n".join(doc.page_content for doc in docs)
    cleaned_full_text = clean_text(remove_references(full_text))

    text_splitter = SpacyTextSplitter(chunk_size=500)
    split_contents = text_splitter.split_text(cleaned_full_text)

    # Fail early with a clear message instead of sending an empty batch to
    # the embeddings API / cosine_similarity (e.g. image-only PDFs).
    if not split_contents:
        raise ValueError("No extractable text was found in the PDF.")

    query_embedding = embeddings_model.embed_query(query)
    similarities = cosine_similarity(
        [query_embedding],
        embeddings_model.embed_documents(split_contents),
    )[0]
    # argsort is ascending, so the tail holds the most similar chunks
    # (ordered from least to most similar among the selected ones).
    top_indices = np.argsort(similarities)[-num_clusters:]

    chain = prompt | llm | StrOutputParser()
    return chain.invoke({
        "question": query,
        "context": ' '.join([split_contents[i] for i in top_indices])
    })
272
+
273
@st.cache_data(show_spinner=False, ttl=3600)
@handle_errors
def process_pdf(_pdf_file_path):
    """Detect and crop figures and tables from every page of a PDF.

    Each page is rendered at 50 dpi and passed to ``model.predict``. Boxes
    with confidence above 0.8 and class id 3 or 4 are kept; for pages with at
    least one such box the page is re-rendered at 300 dpi and every box is
    cropped from the high-resolution image after scaling its coordinates.

    Args:
        _pdf_file_path: Path of the PDF to open.

    Returns:
        A tuple ``(all_figures, all_tables)`` of lists of cropped image
        arrays. Class id 4 goes to figures; the other accepted class (3)
        goes to tables.
    """
    # NOTE(review): Streamlit skips hashing of underscore-prefixed parameters,
    # so this cached function has no argument contributing to its cache key;
    # a second, different PDF may get the first PDF's cached result -- confirm.
    low_dpi, high_dpi = 50, 300
    scale_factor = high_dpi / low_dpi  # High-res to low-res ratio
    all_figures, all_tables = [], []

    # Use the document as a context manager so the file handle is released
    # even if rendering or detection raises.
    with fitz.open(_pdf_file_path) as doc:
        for page in doc:
            low_res = page.get_pixmap(dpi=low_dpi)
            # assumes a 3-channel pixmap (no alpha), as the reshape requires
            low_res_img = np.frombuffer(low_res.samples, dtype=np.uint8).reshape(
                low_res.height, low_res.width, 3
            )

            results = model.predict(low_res_img)
            boxes = [
                (int(box.xyxy[0][0]), int(box.xyxy[0][1]),
                 int(box.xyxy[0][2]), int(box.xyxy[0][3]), int(box.cls[0]))
                for result in results for box in result.boxes
                if box.conf[0] > 0.8 and int(box.cls[0]) in {3, 4}
            ]

            # Only pay for the expensive high-resolution render when needed.
            if boxes:
                high_res = page.get_pixmap(dpi=high_dpi)
                high_res_img = np.frombuffer(high_res.samples, dtype=np.uint8).reshape(
                    high_res.height, high_res.width, 3
                )

                for (x1, y1, x2, y2, cls) in boxes:
                    cropped = high_res_img[int(y1*scale_factor):int(y2*scale_factor),
                                           int(x1*scale_factor):int(x2*scale_factor)]
                    if cls == 4:
                        all_figures.append(cropped)
                    else:
                        all_tables.append(cropped)

    return all_figures, all_tables
305
+
306
def image_to_base64(img):
    """Return ``img`` as a base64-encoded JPEG string.

    The array is converted to an RGB PIL image, shrunk in place to fit within
    800x800 pixels, saved as JPEG at quality 85, and the resulting bytes are
    base64-encoded and decoded to ``str``.
    """
    picture = Image.fromarray(img).convert("RGB")
    picture.thumbnail((800, 800))  # Optimize image size

    jpeg_buffer = io.BytesIO()
    picture.save(jpeg_buffer, format="JPEG", quality=85)

    encoded = base64.b64encode(jpeg_buffer.getvalue())
    return encoded.decode()
312
 
313
  # Streamlit UI Configuration
314
  st.set_page_config(