zliang committed on
Commit 6567523 · verified · 1 Parent(s): 746d859

Update app.py

Files changed (1)
  1. app.py +123 -38
app.py CHANGED
@@ -79,34 +79,103 @@ def scroll_to_bottom():
 # Core processing functions
 @st.cache_data(show_spinner=False, ttl=3600)
 @handle_errors
-def summarize_pdf(_pdf_file_path, num_clusters=10):
-    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
-    llm = ChatOpenAI(model="gpt-4", api_key=openai_api_key, temperature=0.3)
-
-    prompt = ChatPromptTemplate.from_template(
-        """Generate a comprehensive summary with these elements:
-        1. Key findings and conclusions
-        2. Main methodologies used
-        3. Important data points
-        4. Limitations mentioned
-        Context: {topic}"""
-    )
-
-    loader = PyMuPDFLoader(_pdf_file_path)
-    docs = loader.load()
-    full_text = "\n".join(doc.page_content for doc in docs)
-    cleaned_full_text = clean_text(remove_references(full_text))
-
-    text_splitter = SpacyTextSplitter(chunk_size=500)
-    split_contents = text_splitter.split_text(cleaned_full_text)
-
-    embeddings = embeddings_model.embed_documents(split_contents)
-    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)
-    closest_indices = [np.argmin(np.linalg.norm(embeddings - center, axis=1))
-                       for center in kmeans.cluster_centers_]
-
-    chain = prompt | llm | StrOutputParser()
-    return chain.invoke({"topic": ' '.join([split_contents[idx] for idx in closest_indices])})
 
 @st.cache_data(show_spinner=False, ttl=3600)
 @handle_errors
@@ -114,34 +183,50 @@ def qa_pdf(_pdf_file_path, query, num_clusters=5):
     embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
     llm = ChatOpenAI(model="gpt-4", api_key=openai_api_key, temperature=0.3)
 
-    prompt = ChatPromptTemplate.from_template(
-        """Answer this question: {question}
-        Using only this context: {context}
-        Format your answer with:
-        - Clear section headings
-        - Bullet points for lists
-        - Bold key terms
-        - Citations from the text"""
-    )
-
     loader = PyMuPDFLoader(_pdf_file_path)
     docs = loader.load()
-    full_text = "\n".join(doc.page_content for doc in docs)
-    cleaned_full_text = clean_text(remove_references(full_text))
 
     text_splitter = SpacyTextSplitter(chunk_size=500)
-    split_contents = text_splitter.split_text(cleaned_full_text)
 
     query_embedding = embeddings_model.embed_query(query)
-    similarities = cosine_similarity([query_embedding],
-                                     embeddings_model.embed_documents(split_contents))[0]
     top_indices = np.argsort(similarities)[-num_clusters:]
 
     chain = prompt | llm | StrOutputParser()
-    return chain.invoke({
         "question": query,
-        "context": ' '.join([split_contents[i] for i in top_indices])
     })
 
 @st.cache_data(show_spinner=False, ttl=3600)
 @handle_errors
 
 # Core processing functions
 @st.cache_data(show_spinner=False, ttl=3600)
 @handle_errors
+def summarize_pdf(pdf_file_path, num_clusters=10):
+    # Chat model used by the summarization chain below
+    llm = ChatOpenAI(model="gpt-4", api_key=openai_api_key, temperature=0.3)
+
+    # Keep track of page numbers for each chunk
+    loader = PyMuPDFLoader(pdf_file_path)
+    docs = loader.load()
+
+    # Create chunks with page numbers
+    text_splitter = SpacyTextSplitter(chunk_size=500)
+    chunks_with_metadata = []
+    for doc in docs:
+        chunks = text_splitter.split_text(doc.page_content)
+        for chunk in chunks:
+            chunks_with_metadata.append({
+                "text": chunk,
+                "page": doc.metadata["page"] + 1  # Convert to 1-based numbering
+            })
 
+    # Modified prompt for citation formatting; {{summary_content}} is doubled
+    # so it stays a literal cue for the model rather than a template input
+    prompt = ChatPromptTemplate.from_template(
+        """Generate a summary with inline citations for each key point using [Source X] format.
+        Structure your response as:
+
+        ## Comprehensive Summary
+        {{summary_content}}
+
+        ## Source References
+        {sources_list}
+
+        Contexts: {topic}"""
     )
 
+    # Create source mapping
+    sources = [f"Source {i+1}: Page {chunk['page']}"
+               for i, chunk in enumerate(chunks_with_metadata)]
 
+    # Generate summary with citations
+    chain = prompt | llm | StrOutputParser()
+    results = chain.invoke({
+        "topic": ' '.join([chunk["text"] for chunk in chunks_with_metadata]),
+        "sources_list": "\n".join(sources)
+    })
+
+    return add_interactive_citations(results, chunks_with_metadata)
+
+
+def add_interactive_citations(summary_text, source_chunks):
+    # Create source boxes with page numbers and full text
+    sources_html = """<div style="margin-top: 20px; border-top: 2px solid #e0e0e0; padding-top: 15px;">
+    <h4>📚 Source References</h4>"""
 
+    for idx, chunk in enumerate(source_chunks):
+        sources_html += f"""
+        <div id="source-{idx+1}" style="margin: 10px 0; padding: 10px;
+             border: 1px solid #e0e0e0; border-radius: 5px;
+             transition: all 0.3s ease;">
+            <div style="display: flex; justify-content: space-between;">
+                <strong>Source {idx+1}</strong>
+                <span style="color: #666;">Page {chunk['page']}</span>
+            </div>
+            <div style="margin-top: 5px; color: #444; font-size: 0.9em;">
+                {chunk['text']}
+            </div>
+        </div>
+        """
+    sources_html += "</div>"
 
+    # Add click interactions
+    interaction_js = """
+    <script>
+    document.querySelectorAll('[data-citation]').forEach(item => {
+        item.addEventListener('click', function(e) {
+            const sourceId = this.getAttribute('data-source');
+            const sourceDiv = document.getElementById(sourceId);
+
+            // Highlight animation
+            sourceDiv.style.border = '2px solid #4CAF50';
+            sourceDiv.style.boxShadow = '0 2px 8px rgba(76,175,80,0.3)';
+
+            setTimeout(() => {
+                sourceDiv.style.border = '1px solid #e0e0e0';
+                sourceDiv.style.boxShadow = 'none';
+            }, 1000);
+
+            // Smooth scroll
+            sourceDiv.scrollIntoView({behavior: 'smooth'});
+        });
+    });
+    </script>
+    """
+
+    # Replace citations with interactive elements (relies on a module-level `import re`)
+    cited_summary = re.sub(r'\[Source (\d+)\]',
+                           lambda m: f'<a data-citation="true" data-source="source-{m.group(1)}" '
+                                     f'style="cursor: pointer; color: #4CAF50; text-decoration: underline;">'
+                                     f'[Source {m.group(1)}]</a>',
+                           summary_text)
+
+    return f"{cited_summary}{sources_html}{interaction_js}"
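A rendering note on the helper above (a minimal usage sketch, assuming the app displays the result with Streamlit, as the st.cache_data decorators suggest): the returned string mixes HTML with a <script> block, and st.markdown(..., unsafe_allow_html=True) does not execute scripts, so the click-to-highlight behavior only works when the result is embedded through st.components.v1.html. The uploaded_pdf_path name here is hypothetical.

    import streamlit as st
    import streamlit.components.v1 as components

    # summarize_pdf returns HTML + JS; components.html runs it inside an
    # iframe, so the citation click handlers actually fire
    summary_html = summarize_pdf(uploaded_pdf_path)  # hypothetical path variable
    components.html(summary_html, height=800, scrolling=True)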
 @st.cache_data(show_spinner=False, ttl=3600)
 @handle_errors

@@ -114,34 +183,50 @@ def qa_pdf(_pdf_file_path, query, num_clusters=5):
     embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
     llm = ChatOpenAI(model="gpt-4", api_key=openai_api_key, temperature=0.3)
 
+    # Load PDF with page numbers
     loader = PyMuPDFLoader(_pdf_file_path)
     docs = loader.load()
 
+    # Create chunks with page metadata
     text_splitter = SpacyTextSplitter(chunk_size=500)
+    chunks_with_metadata = []
+    for doc in docs:
+        chunks = text_splitter.split_text(doc.page_content)
+        for chunk in chunks:
+            chunks_with_metadata.append({
+                "text": clean_text(chunk),
+                "page": doc.metadata["page"] + 1
+            })
 
+    # Find relevant chunks
+    embeddings = embeddings_model.embed_documents([chunk["text"] for chunk in chunks_with_metadata])
     query_embedding = embeddings_model.embed_query(query)
+    similarities = cosine_similarity([query_embedding], embeddings)[0]
     top_indices = np.argsort(similarities)[-num_clusters:]
 
+    # Prepare prompt with citation instructions
+    prompt = ChatPromptTemplate.from_template(
+        """Answer this question with inline citations using [Source X] format:
+        {question}
+
+        Use these verified sources:
+        {context}
+
+        Structure your answer with:
+        - Clear section headings
+        - Bullet points for lists
+        - Citations for all factual claims"""
+    )
+
     chain = prompt | llm | StrOutputParser()
+    raw_answer = chain.invoke({
         "question": query,
+        # Number sources by rank so the model's [Source X] citations match
+        # the source boxes rendered for the top-k chunks
+        "context": '\n\n'.join([f"Source {rank+1} (Page {chunks_with_metadata[i]['page']}): {chunks_with_metadata[i]['text']}"
+                                for rank, i in enumerate(top_indices)])
     })
+
+    # Render with the interactive-citation helper defined above
+    return add_interactive_citations(raw_answer, [chunks_with_metadata[i] for i in top_indices])
 
 @st.cache_data(show_spinner=False, ttl=3600)
 @handle_errors
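One detail worth knowing when editing the templates above (a standalone sketch, assuming the standard langchain_core package layout): ChatPromptTemplate.from_template treats every single-braced {name} as a required input variable, so a placeholder meant for the model to fill in, like the summary structure in summarize_pdf, must be written with doubled braces to stay literal.

    from langchain_core.prompts import ChatPromptTemplate

    # {topic} is a real input; {{summary_content}} renders as the literal
    # text "{summary_content}" and needs no value at invoke time
    prompt = ChatPromptTemplate.from_template(
        "Summarize this as:\n## Summary\n{{summary_content}}\nContext: {topic}"
    )
    print(prompt.input_variables)  # -> ['topic']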