Update app.py
Browse files
app.py
CHANGED
@@ -235,6 +235,80 @@ def qa_pdf(_pdf_file_path, query, num_clusters=5):
|
|
235 |
# (Keep the rest of the code from previous implementation for PDF processing and UI)
|
236 |
# [Include the process_pdf, image_to_base64, and Streamlit UI code from previous response]
|
237 |
# [Make sure to maintain all the UI improvements and error handling]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
|
239 |
# Streamlit UI Configuration
|
240 |
st.set_page_config(
|
|
|
235 |
# (Keep the rest of the code from previous implementation for PDF processing and UI)
|
236 |
# [Include the process_pdf, image_to_base64, and Streamlit UI code from previous response]
|
237 |
# [Make sure to maintain all the UI improvements and error handling]
|
238 |
+
@st.cache_data(show_spinner=False, ttl=3600)
@handle_errors
def _qa_pdf_cached(pdf_file_path, file_token, query, num_clusters):
    """Answer ``query`` from the PDF at ``pdf_file_path`` (cached worker).

    Every parameter is deliberately named WITHOUT a leading underscore so
    that ``st.cache_data`` includes all of them in the cache key.

    Args:
        pdf_file_path: Path of the PDF, passed to ``PyMuPDFLoader``.
        file_token: Opaque fingerprint of the file. It is not used in the
            body; it exists only so a changed file yields a new cache entry.
        query: The user's question.
        num_clusters: Number of most-similar chunks to hand to the LLM.

    Returns:
        The string produced by ``prompt | llm | StrOutputParser()``.

    Raises:
        ValueError: If no text chunks could be extracted from the PDF.
    """
    embeddings_model = OpenAIEmbeddings(
        model="text-embedding-3-small", api_key=openai_api_key
    )
    llm = ChatOpenAI(model="gpt-4", api_key=openai_api_key, temperature=0.3)

    prompt = ChatPromptTemplate.from_template(
        "Answer this question: {question}\n"
        "Using only this context: {context}\n"
        "Format your answer with:\n"
        "- Clear section headings\n"
        "- Bullet points for lists\n"
        "- Bold key terms\n"
        "- Citations from the text"
    )

    # Load every page, merge the text, then strip references and clean it.
    docs = PyMuPDFLoader(pdf_file_path).load()
    full_text = "\n".join(doc.page_content for doc in docs)
    cleaned_full_text = clean_text(remove_references(full_text))

    text_splitter = SpacyTextSplitter(chunk_size=500)
    split_contents = text_splitter.split_text(cleaned_full_text)
    if not split_contents:
        # Fail with a clear message instead of an opaque shape error from
        # cosine_similarity on an empty embedding list.
        raise ValueError("No text could be extracted from the PDF.")

    query_embedding = embeddings_model.embed_query(query)
    similarities = cosine_similarity(
        [query_embedding],
        embeddings_model.embed_documents(split_contents),
    )[0]

    # Clamp the chunk count: a value of 0 would make the slice ``[-0:]``
    # select *every* chunk, and a negative value would drop the best ones.
    top_k = max(1, min(int(num_clusters), len(split_contents)))
    # argsort is ascending, so the last ``top_k`` entries are the most similar.
    top_indices = np.argsort(similarities)[-top_k:]

    chain = prompt | llm | StrOutputParser()
    return chain.invoke({
        "question": query,
        "context": ' '.join([split_contents[i] for i in top_indices]),
    })


def qa_pdf(_pdf_file_path, query, num_clusters=5):
    """Answer ``query`` using the most relevant passages of a PDF.

    The PDF text is split into chunks, each chunk is embedded, and the
    ``num_clusters`` chunks with the highest cosine similarity to the query
    embedding are given to the LLM as context. Despite its name,
    ``num_clusters`` is a top-k chunk count; no clustering is performed.

    Streamlit's cache skips parameters whose name starts with an underscore,
    so caching this function directly would key the result on
    ``(query, num_clusters)`` only and serve one PDF's answer for another.
    The signature is kept for callers; the cached work is delegated to
    ``_qa_pdf_cached`` with the path and a file fingerprint as hashed
    arguments.

    Args:
        _pdf_file_path: Path to the PDF file.
        query: The user's question.
        num_clusters: How many top-matching chunks to use as context.

    Returns:
        Whatever the cached, ``handle_errors``-wrapped worker returns
        (the answer string on success).
    """
    import os

    # Fingerprint the file so a new upload that reuses the same path does
    # not hit a stale cache entry.
    try:
        file_info = os.stat(_pdf_file_path)
        file_token = (file_info.st_mtime_ns, file_info.st_size)
    except (OSError, TypeError, ValueError):
        # Let the worker (under handle_errors) report the unreadable file.
        file_token = None

    return _qa_pdf_cached(str(_pdf_file_path), file_token, query, num_clusters)


# Keep ``qa_pdf.clear()`` working for callers that used the cache API.
qa_pdf.clear = _qa_pdf_cached.clear
|
272 |
+
|
273 |
+
@st.cache_data(show_spinner=False, ttl=3600)
@handle_errors
def _process_pdf_cached(pdf_file_path, file_token):
    """Detect and crop figures and tables from a PDF (cached worker).

    Both parameters are named WITHOUT a leading underscore so that
    ``st.cache_data`` hashes them into the cache key.

    Args:
        pdf_file_path: Path of the PDF, passed to ``fitz.open``.
        file_token: Opaque fingerprint of the file. It is not used in the
            body; it exists only so a changed file yields a new cache entry.

    Returns:
        ``(all_figures, all_tables)``: two lists of image arrays cropped
        from 300-dpi page renders.
    """
    low_dpi, high_dpi = 50, 300
    all_figures, all_tables = [], []

    # The context manager closes the document even if detection raises.
    with fitz.open(pdf_file_path) as doc:
        for page in doc:
            # Run detection on a cheap low-resolution render first.
            low_res = page.get_pixmap(dpi=low_dpi)
            low_res_img = np.frombuffer(low_res.samples, dtype=np.uint8).reshape(
                low_res.height, low_res.width, 3
            )

            results = model.predict(low_res_img)
            # Keep confident detections of class ids 3 and 4 only.
            boxes = [
                (int(box.xyxy[0][0]), int(box.xyxy[0][1]),
                 int(box.xyxy[0][2]), int(box.xyxy[0][3]), int(box.cls[0]))
                for result in results for box in result.boxes
                if box.conf[0] > 0.8 and int(box.cls[0]) in {3, 4}
            ]
            if not boxes:
                continue

            # Render at high resolution only for pages with detections.
            high_res = page.get_pixmap(dpi=high_dpi)
            high_res_img = np.frombuffer(high_res.samples, dtype=np.uint8).reshape(
                high_res.height, high_res.width, 3
            )

            # Derive the scale from the real pixmap sizes: pixel dimensions
            # are rounded, so the ratio is not always exactly 300 / 50.
            scale_x = high_res.width / low_res.width
            scale_y = high_res.height / low_res.height

            for (x1, y1, x2, y2, cls) in boxes:
                # Clamp at 0 so a slightly negative box edge does not wrap
                # around as a negative slice index.
                top, bottom = max(0, int(y1 * scale_y)), max(0, int(y2 * scale_y))
                left, right = max(0, int(x1 * scale_x)), max(0, int(x2 * scale_x))
                cropped = high_res_img[top:bottom, left:right]
                if cropped.size == 0:
                    # A degenerate box yields nothing to display.
                    continue
                # Copy so the crop does not keep the full page buffer alive.
                cropped = cropped.copy()
                if cls == 4:
                    all_figures.append(cropped)
                else:
                    all_tables.append(cropped)

    return all_figures, all_tables


def process_pdf(_pdf_file_path):
    """Extract figure and table crops from every page of a PDF.

    Streamlit's cache skips parameters whose name starts with an underscore.
    Because this function's only parameter is underscore-prefixed, caching it
    directly would give a constant cache key and return the first processed
    PDF's images for every later PDF. The signature is kept for callers; the
    cached work is delegated to ``_process_pdf_cached`` with the path and a
    file fingerprint as hashed arguments.

    Args:
        _pdf_file_path: Path to the PDF file.

    Returns:
        Whatever the cached, ``handle_errors``-wrapped worker returns
        (``(all_figures, all_tables)`` on success).
    """
    import os

    # Fingerprint the file so a new upload that reuses the same path does
    # not hit a stale cache entry.
    try:
        file_info = os.stat(_pdf_file_path)
        file_token = (file_info.st_mtime_ns, file_info.st_size)
    except (OSError, TypeError, ValueError):
        # Let the worker (under handle_errors) report the unreadable file.
        file_token = None

    return _process_pdf_cached(str(_pdf_file_path), file_token)


# Keep ``process_pdf.clear()`` working for callers that used the cache API.
process_pdf.clear = _process_pdf_cached.clear
|
305 |
+
|
306 |
+
def image_to_base64(img):
    """Return ``img`` encoded as a base64 JPEG string.

    The array is converted to an RGB PIL image, shrunk in place to fit
    within 800x800 pixels, saved as JPEG at quality 85, and the resulting
    bytes are base64-encoded and decoded to ``str``.
    """
    picture = Image.fromarray(img).convert("RGB")
    # thumbnail() resizes in place and never enlarges the image.
    picture.thumbnail((800, 800))

    jpeg_stream = io.BytesIO()
    picture.save(jpeg_stream, format="JPEG", quality=85)

    encoded_bytes = base64.b64encode(jpeg_stream.getvalue())
    return encoded_bytes.decode()
|
312 |
|
313 |
# Streamlit UI Configuration
|
314 |
st.set_page_config(
|