zliang commited on
Commit
52d159a
·
verified Β·
1 Parent(s): f8659b6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -19
app.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  import os
3
  import time
4
  import io
@@ -77,20 +76,24 @@ def scroll_to_bottom():
77
  """
78
  st.components.v1.html(js, height=0)
79
 
80
- # Core processing functions
 
 
 
81
  @st.cache_data(show_spinner=False, ttl=3600)
82
  @handle_errors
83
  def summarize_pdf(_pdf_file_path, num_clusters=10):
 
84
  embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
85
  llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
86
 
87
  prompt = ChatPromptTemplate.from_template(
88
  """Generate a comprehensive summary with these elements:
89
- 1. Key findings and conclusions
90
- 2. Main methodologies used
91
- 3. Important data points
92
- 4. Limitations mentioned
93
- Context: {topic}"""
94
  )
95
 
96
  loader = PyMuPDFLoader(_pdf_file_path)
@@ -104,11 +107,65 @@ def summarize_pdf(_pdf_file_path, num_clusters=10):
104
  embeddings = embeddings_model.embed_documents(split_contents)
105
  kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)
106
  closest_indices = [np.argmin(np.linalg.norm(embeddings - center, axis=1))
107
- for center in kmeans.cluster_centers_]
108
 
109
  chain = prompt | llm | StrOutputParser()
110
  return chain.invoke({"topic": ' '.join([split_contents[idx] for idx in closest_indices])})
111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  @st.cache_data(show_spinner=False, ttl=3600)
113
  @handle_errors
114
  def qa_pdf(_pdf_file_path, query, num_clusters=5):
@@ -117,12 +174,12 @@ def qa_pdf(_pdf_file_path, query, num_clusters=5):
117
 
118
  prompt = ChatPromptTemplate.from_template(
119
  """Answer this question: {question}
120
- Using only this context: {context}
121
- Format your answer with:
122
- - Clear section headings
123
- - Bullet points for lists
124
- - Bold key terms
125
- - Citations from the text"""
126
  )
127
 
128
  loader = PyMuPDFLoader(_pdf_file_path)
@@ -135,7 +192,7 @@ def qa_pdf(_pdf_file_path, query, num_clusters=5):
135
 
136
  query_embedding = embeddings_model.embed_query(query)
137
  similarities = cosine_similarity([query_embedding],
138
- embeddings_model.embed_documents(split_contents))[0]
139
  top_indices = np.argsort(similarities)[-num_clusters:]
140
 
141
  chain = prompt | llm | StrOutputParser()
@@ -144,6 +201,7 @@ def qa_pdf(_pdf_file_path, query, num_clusters=5):
144
  "context": ' '.join([split_contents[i] for i in top_indices])
145
  })
146
 
 
147
  @st.cache_data(show_spinner=False, ttl=3600)
148
  @handle_errors
149
  def process_pdf(_pdf_file_path):
@@ -169,7 +227,7 @@ def process_pdf(_pdf_file_path):
169
 
170
  for (x1, y1, x2, y2, cls) in boxes:
171
  cropped = high_res_img[int(y1*scale_factor):int(y2*scale_factor),
172
- int(x1*scale_factor):int(x2*scale_factor)]
173
  if cls == 4:
174
  all_figures.append(cropped)
175
  else:
@@ -184,7 +242,9 @@ def image_to_base64(img):
184
  img.save(buffered, format="JPEG", quality=85)
185
  return base64.b64encode(buffered.getvalue()).decode()
186
 
187
- # Streamlit UI
 
 
188
  st.set_page_config(
189
  page_title="PDF Assistant",
190
  page_icon="📄",
@@ -226,6 +286,9 @@ if uploaded_file:
226
  with open(file_path, "wb") as f:
227
  f.write(uploaded_file.getbuffer())
228
 
 
 
 
229
  chat_container = st.container()
230
  with chat_container:
231
  for idx, chat in enumerate(st.session_state.chat_history):
@@ -246,7 +309,10 @@ if uploaded_file:
246
  if st.button("📝 Generate Summary", use_container_width=True):
247
  with st.spinner("Analyzing document structure..."):
248
  show_progress("Generating summary")
249
- summary = summarize_pdf(file_path)
 
 
 
250
  st.session_state.chat_history.append({
251
  "user": "Summary request",
252
  "bot": f"## Document Summary\n{summary}"
@@ -314,4 +380,4 @@ st.markdown("""
314
  padding: 2rem;
315
  }
316
  </style>
317
- """, unsafe_allow_html=True)
 
 
1
  import os
2
  import time
3
  import io
 
76
  """
77
  st.components.v1.html(js, height=0)
78
 
79
+ # ----------------------------
80
+ # Core Processing Functions
81
+ # ----------------------------
82
+
83
  @st.cache_data(show_spinner=False, ttl=3600)
84
  @handle_errors
85
  def summarize_pdf(_pdf_file_path, num_clusters=10):
86
+ # Basic summarization without citations
87
  embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
88
  llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
89
 
90
  prompt = ChatPromptTemplate.from_template(
91
  """Generate a comprehensive summary with these elements:
92
+ 1. Key findings and conclusions
93
+ 2. Main methodologies used
94
+ 3. Important data points
95
+ 4. Limitations mentioned
96
+ Context: {topic}"""
97
  )
98
 
99
  loader = PyMuPDFLoader(_pdf_file_path)
 
107
  embeddings = embeddings_model.embed_documents(split_contents)
108
  kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)
109
  closest_indices = [np.argmin(np.linalg.norm(embeddings - center, axis=1))
110
+ for center in kmeans.cluster_centers_]
111
 
112
  chain = prompt | llm | StrOutputParser()
113
  return chain.invoke({"topic": ' '.join([split_contents[idx] for idx in closest_indices])})
114
 
115
+
116
+ @st.cache_data(show_spinner=False, ttl=3600)
117
+ @handle_errors
118
+ def summarize_pdf_with_citations(_pdf_file_path, num_clusters=10):
119
+ """
120
+ Generates a summary that includes in-text citations based on selected context chunks.
121
+ Each context chunk is numbered (e.g. [1], [2], etc.) and is referenced in the summary.
122
+ After the summary, a reference list is provided.
123
+ """
124
+ embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
125
+ llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
126
+
127
+ prompt = ChatPromptTemplate.from_template(
128
+ """Generate a comprehensive summary with the following elements:
129
+ 1. Key findings and conclusions
130
+ 2. Main methodologies used
131
+ 3. Important data points
132
+ 4. Limitations mentioned
133
+
134
+ In your summary, include in-text citations formatted as [1], [2], etc., that refer to the source contexts provided below.
135
+ After the summary, provide a reference list mapping each citation number to its corresponding context excerpt.
136
+
137
+ Contexts:
138
+ {contexts}"""
139
+ )
140
+
141
+ loader = PyMuPDFLoader(_pdf_file_path)
142
+ docs = loader.load()
143
+ full_text = "\n".join(doc.page_content for doc in docs)
144
+ cleaned_full_text = clean_text(remove_references(full_text))
145
+
146
+ text_splitter = SpacyTextSplitter(chunk_size=500)
147
+ split_contents = text_splitter.split_text(cleaned_full_text)
148
+
149
+ embeddings = embeddings_model.embed_documents(split_contents)
150
+ kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)
151
+
152
+ citation_indices = []
153
+ for center in kmeans.cluster_centers_:
154
+ distances = np.linalg.norm(embeddings - center, axis=1)
155
+ idx = int(np.argmin(distances))
156
+ citation_indices.append(idx)
157
+
158
+ # Create a context string with citations (e.g. "[1]: ...", "[2]: ...")
159
+ citation_contexts = []
160
+ for i, idx in enumerate(citation_indices):
161
+ citation_contexts.append(f"[{i+1}]: {split_contents[idx]}")
162
+ combined_contexts = "\n\n".join(citation_contexts)
163
+
164
+ chain = prompt | llm | StrOutputParser()
165
+ result = chain.invoke({"contexts": combined_contexts})
166
+ return result
167
+
168
+
169
  @st.cache_data(show_spinner=False, ttl=3600)
170
  @handle_errors
171
  def qa_pdf(_pdf_file_path, query, num_clusters=5):
 
174
 
175
  prompt = ChatPromptTemplate.from_template(
176
  """Answer this question: {question}
177
+ Using only this context: {context}
178
+ Format your answer with:
179
+ - Clear section headings
180
+ - Bullet points for lists
181
+ - **Bold** key terms
182
+ - Citations from the text"""
183
  )
184
 
185
  loader = PyMuPDFLoader(_pdf_file_path)
 
192
 
193
  query_embedding = embeddings_model.embed_query(query)
194
  similarities = cosine_similarity([query_embedding],
195
+ embeddings_model.embed_documents(split_contents))[0]
196
  top_indices = np.argsort(similarities)[-num_clusters:]
197
 
198
  chain = prompt | llm | StrOutputParser()
 
201
  "context": ' '.join([split_contents[i] for i in top_indices])
202
  })
203
 
204
+
205
  @st.cache_data(show_spinner=False, ttl=3600)
206
  @handle_errors
207
  def process_pdf(_pdf_file_path):
 
227
 
228
  for (x1, y1, x2, y2, cls) in boxes:
229
  cropped = high_res_img[int(y1*scale_factor):int(y2*scale_factor),
230
+ int(x1*scale_factor):int(x2*scale_factor)]
231
  if cls == 4:
232
  all_figures.append(cropped)
233
  else:
 
242
  img.save(buffered, format="JPEG", quality=85)
243
  return base64.b64encode(buffered.getvalue()).decode()
244
 
245
+ # ----------------------------
246
+ # Streamlit UI Setup
247
+ # ----------------------------
248
  st.set_page_config(
249
  page_title="PDF Assistant",
250
  page_icon="📄",
 
286
  with open(file_path, "wb") as f:
287
  f.write(uploaded_file.getbuffer())
288
 
289
+ # Let the user choose whether to include in-text citations in the summary
290
+ include_citations = st.checkbox("Include in-text citations in summary", value=True)
291
+
292
  chat_container = st.container()
293
  with chat_container:
294
  for idx, chat in enumerate(st.session_state.chat_history):
 
309
  if st.button("📝 Generate Summary", use_container_width=True):
310
  with st.spinner("Analyzing document structure..."):
311
  show_progress("Generating summary")
312
+ if include_citations:
313
+ summary = summarize_pdf_with_citations(file_path)
314
+ else:
315
+ summary = summarize_pdf(file_path)
316
  st.session_state.chat_history.append({
317
  "user": "Summary request",
318
  "bot": f"## Document Summary\n{summary}"
 
380
  padding: 2rem;
381
  }
382
  </style>
383
+ """, unsafe_allow_html=True)