zliang committed on
Commit
3a16e8c
·
verified ·
1 Parent(s): 52d159a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -8
app.py CHANGED
@@ -79,21 +79,31 @@ def scroll_to_bottom():
79
  # ----------------------------
80
  # Core Processing Functions
81
  # ----------------------------
82
-
83
  @st.cache_data(show_spinner=False, ttl=3600)
84
  @handle_errors
85
- def summarize_pdf(_pdf_file_path, num_clusters=10):
86
- # Basic summarization without citations
 
 
 
 
87
  embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
88
  llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
89
 
 
90
  prompt = ChatPromptTemplate.from_template(
91
- """Generate a comprehensive summary with these elements:
92
  1. Key findings and conclusions
93
  2. Main methodologies used
94
  3. Important data points
95
  4. Limitations mentioned
96
- Context: {topic}"""
 
 
 
 
 
 
97
  )
98
 
99
  loader = PyMuPDFLoader(_pdf_file_path)
@@ -106,11 +116,24 @@ Context: {topic}"""
106
 
107
  embeddings = embeddings_model.embed_documents(split_contents)
108
  kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)
109
- closest_indices = [np.argmin(np.linalg.norm(embeddings - center, axis=1))
110
- for center in kmeans.cluster_centers_]
 
 
 
 
 
 
 
 
 
 
 
111
 
112
  chain = prompt | llm | StrOutputParser()
113
- return chain.invoke({"topic": ' '.join([split_contents[idx] for idx in closest_indices])})
 
 
114
 
115
 
116
  @st.cache_data(show_spinner=False, ttl=3600)
 
79
  # ----------------------------
80
  # Core Processing Functions
81
  # ----------------------------
 
82
  @st.cache_data(show_spinner=False, ttl=3600)
83
  @handle_errors
84
+ def summarize_pdf_with_citations(_pdf_file_path, num_clusters=10):
85
+ """
86
+ Generates a summary that includes in-text citations based on selected context chunks.
87
+ Each context chunk is numbered (e.g. [1], [2], etc.) and is referenced in the summary.
88
+ After the summary, a reference list is provided mapping each citation number to the full original text excerpt.
89
+ """
90
  embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
91
  llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
92
 
93
+ # Updated prompt instructs the LLM to use the full excerpt in the reference list.
94
  prompt = ChatPromptTemplate.from_template(
95
+ """Generate a comprehensive summary with the following elements:
96
  1. Key findings and conclusions
97
  2. Main methodologies used
98
  3. Important data points
99
  4. Limitations mentioned
100
+
101
+ For any information that is directly derived from the provided context excerpts, insert an in-text citation in the format [n] where n corresponds to the excerpt number.
102
+
103
+ After the summary, please provide a reference list where each citation number is mapped to the full original text excerpt as provided below. Do not simply echo the citation number; include the complete excerpt text.
104
+
105
+ Context Excerpts:
106
+ {contexts}"""
107
  )
108
 
109
  loader = PyMuPDFLoader(_pdf_file_path)
 
116
 
117
  embeddings = embeddings_model.embed_documents(split_contents)
118
  kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)
119
+
120
+ citation_indices = []
121
+ for center in kmeans.cluster_centers_:
122
+ distances = np.linalg.norm(embeddings - center, axis=1)
123
+ idx = int(np.argmin(distances))
124
+ citation_indices.append(idx)
125
+
126
+ # Create a context string with citations including the full original text excerpts
127
+ citation_contexts = []
128
+ for i, idx in enumerate(citation_indices):
129
+ # Using the full excerpt from split_contents for the reference list.
130
+ citation_contexts.append(f"[{i+1}]: {split_contents[idx]}")
131
+ combined_contexts = "\n\n".join(citation_contexts)
132
 
133
  chain = prompt | llm | StrOutputParser()
134
+ result = chain.invoke({"contexts": combined_contexts})
135
+ return result
136
+
137
 
138
 
139
  @st.cache_data(show_spinner=False, ttl=3600)