zliang committed on
Commit
f8659b6
·
verified ·
1 Parent(s): e2f8798

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -126
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  import time
3
  import io
@@ -26,8 +27,6 @@ model = YOLO("best.pt")
26
  openai_api_key = os.environ.get("openai_api_key")
27
  MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB
28
 
29
- llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
30
-
31
  # Utility functions
32
  @st.cache_data(show_spinner=False, ttl=3600)
33
  def clean_text(text):
@@ -81,154 +80,69 @@ def scroll_to_bottom():
81
  # Core processing functions
82
@st.cache_data(show_spinner=False, ttl=3600)
@handle_errors
def summarize_pdf(pdf_file_path, num_clusters=10):
    """Summarize a PDF with inline [Source X] citations tied to page-level chunks.

    Loads the PDF one Document per page, splits each page into ~500-char
    chunks (keeping 1-based page numbers), asks the LLM for a cited summary
    over ALL chunks, and returns the interactive HTML produced by
    add_interactive_citations().

    NOTE(review): `num_clusters` is accepted but never used in this version.
    NOTE(review): the template declares a {summary_content} placeholder that
    invoke() never supplies — LangChain will reject the call at runtime; the
    only inputs provided are {topic} and {sources_list}. Confirm intent.
    """
    # Keep track of page numbers for each chunk
    loader = PyMuPDFLoader(pdf_file_path)
    docs = loader.load()

    # Create chunks with page numbers
    text_splitter = SpacyTextSplitter(chunk_size=500)
    chunks_with_metadata = []
    for doc in docs:
        chunks = text_splitter.split_text(doc.page_content)
        for chunk in chunks:
            chunks_with_metadata.append({
                "text": chunk,
                "page": doc.metadata["page"] + 1  # Convert to 1-based numbering
            })

    # Modified prompt for citation formatting: asks the model to emit
    # [Source X] markers that add_interactive_citations() links afterwards.
    prompt = ChatPromptTemplate.from_template(
        """Generate a summary with inline citations for each key point using [Source X] format.
        Structure your response as:

        ## Comprehensive Summary
        {summary_content}

        ## Source References
        {sources_list}

        Contexts: {topic}"""
    )

    # Create source mapping: one human-readable label per chunk, in order.
    sources = [f"Source {i+1}: Page {chunk['page']}"
               for i, chunk in enumerate(chunks_with_metadata)]

    # Generate summary with citations; `llm` is the module-level ChatOpenAI.
    chain = prompt | llm | StrOutputParser()
    results = chain.invoke({
        "topic": ' '.join([chunk["text"] for chunk in chunks_with_metadata]),
        "sources_list": "\n".join(sources)
    })

    # Convert [Source X] markers into clickable anchors plus source boxes.
    return add_interactive_citations(results, chunks_with_metadata)
126
-
127
-
128
def add_interactive_citations(summary_text, source_chunks):
    """Turn plain [Source N] markers in `summary_text` into interactive HTML.

    Appends a "Source References" panel — one bordered box per chunk showing
    its page number and full text — plus a <script> that flashes and
    smooth-scrolls to the matching box when a citation anchor is clicked.

    Args:
        summary_text: LLM output containing [Source N] markers.
        source_chunks: list of dicts with "page" and "text" keys; the Nth
            entry corresponds to citation [Source N] (1-based).

    Returns:
        One HTML string: cited summary, then source boxes, then the JS.
    """
    # Create source boxes with page numbers and full text; each box id
    # ("source-<n>") is the target of the data-source anchors added below.
    sources_html = """<div style="margin-top: 20px; border-top: 2px solid #e0e0e0; padding-top: 15px;">
    <h4>📚 Source References</h4>"""

    for idx, chunk in enumerate(source_chunks):
        sources_html += f"""
        <div id="source-{idx+1}" style="margin: 10px 0; padding: 10px;
            border: 1px solid #e0e0e0; border-radius: 5px;
            transition: all 0.3s ease;">
            <div style="display: flex; justify-content: space-between;">
                <strong>Source {idx+1}</strong>
                <span style="color: #666;">Page {chunk['page']}</span>
            </div>
            <div style="margin-top: 5px; color: #444; font-size: 0.9em;">
                {chunk['text']}
            </div>
        </div>
        """
    sources_html += "</div>"

    # Add click interactions: green-border flash for 1s, then smooth scroll.
    interaction_js = """
    <script>
    document.querySelectorAll('[data-citation]').forEach(item => {
        item.addEventListener('click', function(e) {
            const sourceId = this.getAttribute('data-source');
            const sourceDiv = document.getElementById(sourceId);

            // Highlight animation
            sourceDiv.style.border = '2px solid #4CAF50';
            sourceDiv.style.boxShadow = '0 2px 8px rgba(76,175,80,0.3)';

            setTimeout(() => {
                sourceDiv.style.border = '1px solid #e0e0e0';
                sourceDiv.style.boxShadow = 'none';
            }, 1000);

            // Smooth scroll
            sourceDiv.scrollIntoView({behavior: 'smooth'});
        });
    });
    </script>
    """

    # Replace citations with interactive elements: each [Source N] becomes an
    # <a> carrying data-source="source-N", which the script above resolves.
    cited_summary = re.sub(r'\[Source (\d+)\]',
                           lambda m: f'<a data-citation="true" data-source="source-{m.group(1)}" '
                                     f'style="cursor: pointer; color: #4CAF50; text-decoration: underline;">'
                                     f'[Source {m.group(1)}]</a>',
                           summary_text)

    return f"{cited_summary}{sources_html}{interaction_js}"
 
181
 
182
@st.cache_data(show_spinner=False, ttl=3600)
@handle_errors
def qa_pdf(_pdf_file_path, query, num_clusters=5):
    """Answer `query` about a PDF, citing page-level source chunks.

    Embeds every chunk and the query, keeps the `num_clusters` most similar
    chunks (cosine similarity), asks the LLM for an answer with [Source X]
    citations, and returns interactive HTML.

    The leading underscore on `_pdf_file_path` tells st.cache_data not to
    hash that argument.
    """
    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
    # llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)

    # Load PDF with page numbers (one Document per page).
    loader = PyMuPDFLoader(_pdf_file_path)
    docs = loader.load()

    # Create chunks with page metadata (1-based page numbers).
    text_splitter = SpacyTextSplitter(chunk_size=500)
    chunks_with_metadata = []
    for doc in docs:
        chunks = text_splitter.split_text(doc.page_content)
        for chunk in chunks:
            chunks_with_metadata.append({
                "text": clean_text(chunk),
                "page": doc.metadata["page"] + 1
            })

    # Find relevant chunks: rank all chunks against the query embedding.
    embeddings = embeddings_model.embed_documents([chunk["text"] for chunk in chunks_with_metadata])
    query_embedding = embeddings_model.embed_query(query)
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    top_indices = np.argsort(similarities)[-num_clusters:]

    # Prepare prompt with citation instructions.
    prompt = ChatPromptTemplate.from_template(
        """Answer this question with inline citations using [Source X] format:
        {question}

        Use these verified sources:
        {context}

        Structure your answer with:
        - Clear section headings
        - Bullet points for lists
        - Citations for all factual claims"""
    )

    # `llm` is the module-level ChatOpenAI instance.
    # NOTE(review): the context labels use the chunk's index in the FULL list
    # (i+1), while the citation renderer numbers its boxes 1..num_clusters —
    # the [Source X] numbers the model emits may not match any box. Verify.
    chain = prompt | llm | StrOutputParser()
    raw_answer = chain.invoke({
        "question": query,
        "context": '\n\n'.join([f"Source {i+1} (Page {chunks_with_metadata[i]['page']}): {chunks_with_metadata[i]['text']}"
                                for i in top_indices])
    })

    # NOTE(review): generate_interactive_citations is not visible in this
    # view — presumably a sibling of add_interactive_citations; confirm.
    return generate_interactive_citations(raw_answer, [chunks_with_metadata[i] for i in top_indices])
231
-
232
 
233
  @st.cache_data(show_spinner=False, ttl=3600)
234
  @handle_errors
 
1
+
2
  import os
3
  import time
4
  import io
 
27
  openai_api_key = os.environ.get("openai_api_key")
28
  MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB
29
 
 
 
30
  # Utility functions
31
  @st.cache_data(show_spinner=False, ttl=3600)
32
  def clean_text(text):
 
80
  # Core processing functions
81
@st.cache_data(show_spinner=False, ttl=3600)
@handle_errors
def summarize_pdf(_pdf_file_path, num_clusters=10):
    """Summarize a PDF by clustering chunk embeddings and summarizing one
    representative chunk per cluster.

    Pipeline: load pages -> strip references -> clean -> split into
    ~500-char chunks -> embed -> KMeans -> pick the chunk nearest each
    centroid -> send those chunks to the LLM.

    Args:
        _pdf_file_path: path to the PDF; the leading underscore tells
            st.cache_data not to hash this argument.
        num_clusters: upper bound on representative chunks (default 10).

    Returns:
        The LLM's summary string, or "" when no text could be extracted.
    """
    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
    llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)

    prompt = ChatPromptTemplate.from_template(
        """Generate a comprehensive summary with these elements:
        1. Key findings and conclusions
        2. Main methodologies used
        3. Important data points
        4. Limitations mentioned
        Context: {topic}"""
    )

    # Extract the full document text and drop the references section.
    loader = PyMuPDFLoader(_pdf_file_path)
    docs = loader.load()
    full_text = "\n".join(doc.page_content for doc in docs)
    cleaned_full_text = clean_text(remove_references(full_text))

    text_splitter = SpacyTextSplitter(chunk_size=500)
    split_contents = text_splitter.split_text(cleaned_full_text)

    # Fix: guard short/empty documents. KMeans raises ValueError when
    # n_clusters exceeds the number of samples, so clamp it; an empty
    # document has nothing to summarize at all.
    if not split_contents:
        return ""
    effective_clusters = min(num_clusters, len(split_contents))

    # Pick the chunk closest (Euclidean) to each centroid as its representative.
    embeddings = embeddings_model.embed_documents(split_contents)
    emb_matrix = np.asarray(embeddings)
    kmeans = KMeans(n_clusters=effective_clusters, random_state=0).fit(emb_matrix)
    closest_indices = [np.argmin(np.linalg.norm(emb_matrix - center, axis=1))
                       for center in kmeans.cluster_centers_]

    chain = prompt | llm | StrOutputParser()
    return chain.invoke({"topic": ' '.join([split_contents[idx] for idx in closest_indices])})
111
 
112
@st.cache_data(show_spinner=False, ttl=3600)
@handle_errors
def qa_pdf(_pdf_file_path, query, num_clusters=5):
    """Answer `query` using only text retrieved from the PDF.

    Ranks ~500-char chunks of the cleaned document by cosine similarity to
    the query embedding, keeps the top `num_clusters`, and asks GPT-4 to
    answer from that context alone.

    The leading underscore on `_pdf_file_path` tells st.cache_data not to
    hash that argument.
    """
    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
    llm = ChatOpenAI(model="gpt-4", api_key=openai_api_key, temperature=0.3)

    prompt = ChatPromptTemplate.from_template(
        """Answer this question: {question}
        Using only this context: {context}
        Format your answer with:
        - Clear section headings
        - Bullet points for lists
        - Bold key terms
        - Citations from the text"""
    )

    # Extract and normalise the document text.
    pages = PyMuPDFLoader(_pdf_file_path).load()
    document_text = clean_text(remove_references("\n".join(page.page_content for page in pages)))

    # Chunk the document, embed everything, and rank chunks against the query.
    chunks = SpacyTextSplitter(chunk_size=500).split_text(document_text)
    chunk_vectors = embeddings_model.embed_documents(chunks)
    query_vector = embeddings_model.embed_query(query)
    similarities = cosine_similarity([query_vector], chunk_vectors)[0]
    best_indices = np.argsort(similarities)[-num_clusters:]

    # Hand only the best-matching chunks to the model.
    answer_chain = prompt | llm | StrOutputParser()
    return answer_chain.invoke({
        "question": query,
        "context": ' '.join([chunks[i] for i in best_indices])
    })
 
 
 
146
 
147
  @st.cache_data(show_spinner=False, ttl=3600)
148
  @handle_errors