Update app.py
app.py CHANGED
@@ -81,26 +81,33 @@ def scroll_to_bottom():
 # ----------------------------
 @st.cache_data(show_spinner=False, ttl=3600)
 @handle_errors
-
+
+@st.cache_data(show_spinner=False, ttl=3600)
+@handle_errors
+def summarize_pdf_with_tooltips(_pdf_file_path, num_clusters=10):
     """
-    Generates a summary
-    Each
-    After the summary, a reference list is provided mapping each citation number to the full original text excerpt.
+    Generates a summary with in-text citations that display the full excerpt as a tooltip on hover.
+    Each citation is embedded as an HTML span element with the tooltip text.
     """
     embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
     llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
 
-    # Updated prompt instructs the LLM to use the full excerpt in the reference list.
     prompt = ChatPromptTemplate.from_template(
-        """Generate a comprehensive summary
+        """Generate a comprehensive summary that includes the following:
 1. Key findings and conclusions
 2. Main methodologies used
 3. Important data points
 4. Limitations mentioned
 
-For any information
+For any information directly derived from the context excerpts provided below, insert an in-text citation as an HTML tooltip.
+For each citation, use the following HTML format:
+<span class="tooltip" data-tooltip="{full_text}">[{n}]</span>
+
+Where:
+- {n} is the citation number.
+- {full_text} is the complete excerpt text for that citation.
 
-
+Do not provide a separate reference list. Instead, embed the full citation text directly in the tooltip.
 
 Context Excerpts:
 {contexts}"""
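Note on the new prompt: unless a different template_format is passed further down, `ChatPromptTemplate.from_template` uses f-string formatting, so `{full_text}` and `{n}` would be parsed as extra input variables rather than reaching the model as literal text, and `chain.invoke({"contexts": combined_contexts})` would then fail with a missing-variables error. A minimal sketch of the escaped form, with the template shortened and the import path assumed, not the committed code:

    # Sketch only: double the braces that should reach the model literally,
    # so that "contexts" stays the template's only input variable.
    from langchain_core.prompts import ChatPromptTemplate  # assumed import path

    prompt = ChatPromptTemplate.from_template(
        """For each citation, use the following HTML format:
    <span class="tooltip" data-tooltip="{{full_text}}">[{{n}}]</span>

    Context Excerpts:
    {contexts}"""
    )
    print(prompt.input_variables)  # ['contexts']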
@@ -123,11 +130,12 @@ Context Excerpts:
         idx = int(np.argmin(distances))
         citation_indices.append(idx)
 
-    #
+    # Build the context excerpts string.
     citation_contexts = []
     for i, idx in enumerate(citation_indices):
-        #
-
+        # Replace double quotes to avoid breaking HTML attribute quotes.
+        excerpt = split_contents[idx].replace('"', "'")
+        citation_contexts.append(f"[{i+1}]: {excerpt}")
     combined_contexts = "\n\n".join(citation_contexts)
 
     chain = prompt | llm | StrOutputParser()
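The `replace('"', "'")` guard above only protects the `data-tooltip="..."` attribute against double quotes. Excerpts can also contain `&`, `<` or `>`, which would still break or alter the surrounding HTML. A sketch of a stricter escape built on the standard library (hypothetical helper name, not part of the commit):

    import html

    def excerpt_for_tooltip(text: str) -> str:
        # Escape &, <, > and both quote characters so the excerpt is safe
        # inside an HTML attribute value.
        return html.escape(text, quote=True)

    example = 'A "quoted" claim & a <tag>'
    print(f'[1]: {excerpt_for_tooltip(example)}')
    # [1]: A &quot;quoted&quot; claim &amp; a &lt;tag&gt;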
@@ -136,57 +144,6 @@ Context Excerpts:
 
 
 
-@st.cache_data(show_spinner=False, ttl=3600)
-@handle_errors
-def summarize_pdf_with_citations(_pdf_file_path, num_clusters=10):
-    """
-    Generates a summary that includes in-text citations based on selected context chunks.
-    Each context chunk is numbered (e.g. [1], [2], etc.) and is referenced in the summary.
-    After the summary, a reference list is provided.
-    """
-    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
-    llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
-
-    prompt = ChatPromptTemplate.from_template(
-        """Generate a comprehensive summary with the following elements:
-1. Key findings and conclusions
-2. Main methodologies used
-3. Important data points
-4. Limitations mentioned
-
-In your summary, include in-text citations formatted as [1], [2], etc., that refer to the source contexts provided below.
-After the summary, provide a reference list mapping each citation number to its corresponding context excerpt.
-
-Contexts:
-{contexts}"""
-    )
-
-    loader = PyMuPDFLoader(_pdf_file_path)
-    docs = loader.load()
-    full_text = "\n".join(doc.page_content for doc in docs)
-    cleaned_full_text = clean_text(remove_references(full_text))
-
-    text_splitter = SpacyTextSplitter(chunk_size=500)
-    split_contents = text_splitter.split_text(cleaned_full_text)
-
-    embeddings = embeddings_model.embed_documents(split_contents)
-    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)
-
-    citation_indices = []
-    for center in kmeans.cluster_centers_:
-        distances = np.linalg.norm(embeddings - center, axis=1)
-        idx = int(np.argmin(distances))
-        citation_indices.append(idx)
-
-    # Create a context string with citations (e.g. "[1]: ...", "[2]: ...")
-    citation_contexts = []
-    for i, idx in enumerate(citation_indices):
-        citation_contexts.append(f"[{i+1}]: {split_contents[idx]}")
-    combined_contexts = "\n\n".join(citation_contexts)
-
-    chain = prompt | llm | StrOutputParser()
-    result = chain.invoke({"contexts": combined_contexts})
-    return result
 
 
 @st.cache_data(show_spinner=False, ttl=3600)
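For reference, the function deleted here duplicated the excerpt-selection step that the surviving summarizer still uses: embed each chunk, cluster the embeddings with KMeans, and keep the chunk nearest each cluster centre as a representative excerpt. A self-contained sketch of just that step (stand-alone names; random vectors stand in for real embeddings):

    import numpy as np
    from sklearn.cluster import KMeans

    def representative_chunk_indices(embeddings: np.ndarray, num_clusters: int = 10) -> list[int]:
        # Cluster the chunk embeddings and return, for each cluster centre,
        # the index of the chunk whose embedding lies closest to it.
        kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)
        indices = []
        for center in kmeans.cluster_centers_:
            distances = np.linalg.norm(embeddings - center, axis=1)
            indices.append(int(np.argmin(distances)))
        return indices

    if __name__ == "__main__":
        fake_embeddings = np.random.default_rng(0).normal(size=(40, 8))
        print(representative_chunk_indices(fake_embeddings, num_clusters=5))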
@@ -332,15 +289,13 @@ if uploaded_file:
         if st.button("📝 Generate Summary", use_container_width=True):
             with st.spinner("Analyzing document structure..."):
                 show_progress("Generating summary")
-
-                    summary = summarize_pdf_with_citations(file_path)
-                else:
-                    summary = summarize_pdf(file_path)
+                summary = summarize_pdf_with_tooltips(file_path)
                 st.session_state.chat_history.append({
                     "user": "Summary request",
                     "bot": f"## Document Summary\n{summary}"
                 })
                 st.rerun()
+
     with col3:
         if st.button("🖼️ Extract Visuals", use_container_width=True):
             with st.spinner("Identifying figures and tables..."):
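Because the tooltip citations are raw HTML, the summary appended to `st.session_state.chat_history` only shows up as hoverable citations if the chat history is rendered with HTML enabled. The rendering loop is outside this diff, so the following is only an assumed sketch of what it needs to do:

    import streamlit as st

    # Assumed display structure; the app may render chat history differently.
    for message in st.session_state.get("chat_history", []):
        with st.chat_message("user"):
            st.markdown(message["user"])
        with st.chat_message("assistant"):
            # unsafe_allow_html=True is required, otherwise the tooltip spans
            # are shown as escaped text instead of hoverable citations.
            st.markdown(message["bot"], unsafe_allow_html=True)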
@@ -402,5 +357,25 @@ st.markdown("""
     border-radius: 12px;
     padding: 2rem;
 }
+.tooltip {
+    position: relative;
+    cursor: pointer;
+    border-bottom: 1px dotted #555;
+}
+
+/* Tooltip text */
+.tooltip:hover::after {
+    content: attr(data-tooltip);
+    position: absolute;
+    left: 0;
+    top: 1.5em;
+    background: #333;
+    color: #fff;
+    padding: 5px 10px;
+    border-radius: 5px;
+    white-space: pre-wrap;
+    z-index: 100;
+    width: 300px; /* Adjust width as needed */
+}
 </style>
 """, unsafe_allow_html=True)
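A quick way to exercise the new CSS without running the whole summarization pipeline is to render a hand-written citation span on the same page (test snippet, not part of the commit):

    import streamlit as st

    # Assumes the <style> block from app.py has already been injected on the page.
    st.markdown(
        '<span class="tooltip" data-tooltip="Full excerpt text shown on hover.">[1]</span>',
        unsafe_allow_html=True,
    )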