ProfessorLeVesseur committed on
Commit
648dea1
·
verified ·
1 Parent(s): aef4af3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +306 -0
app.py ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ---------------------------------------------------------------------------------------
2
+ # Imports and Options
3
+ # ---------------------------------------------------------------------------------------
4
import io
import os
import re

import fitz  # PyMuPDF
import matplotlib.pyplot as plt
import pandas as pd
import requests
import streamlit as st
from docling_core.types.doc.document import DocTagsDocument, DoclingDocument
from mlx_vlm import load, generate
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config, stream_generate
from PIL import Image
16
+
17
+ # Set Streamlit to wide mode
18
+ # st.set_page_config(layout="wide")
19
+
20
+ # ---------------------------------------------------------------------------------------
21
+ # API Configuration
22
+ # ---------------------------------------------------------------------------------------
23
# Stack AI inference endpoint that every request in this app is posted to.
API_URL = "https://api.stack-ai.com/inference/v0/run/2df89a6c-a4af-4576-880e-27058e498f02/67acad8b0603ba4631db38e7"

# SECURITY: a bearer token committed to source control is readable by anyone
# who can see the repository. Supply the token through the STACK_AI_API_KEY
# environment variable instead. The literal below is kept only so existing
# deployments keep working; it should be rotated and then removed.
_DEFAULT_API_TOKEN = 'a9e4979e-cdbe-49ea-a193-53562a784805'

# Default HTTP headers sent with each request to API_URL.
headers = {
    'Authorization': f"Bearer {os.environ.get('STACK_AI_API_KEY') or _DEFAULT_API_TOKEN}",
    'Content-Type': 'application/json',
}
28
+
29
+ # ---------------------------------------------------------------------------------------
30
+ # Survey Analysis Class
31
+ # ---------------------------------------------------------------------------------------
32
class SurveyAnalysis:
    """Send document text to the inference endpoint and collect topic summaries.

    Args:
        api_key: Optional bearer token. When given, it replaces the
            ``Authorization`` entry of the module-level ``headers`` for
            requests made by this instance. When ``None`` (the default) the
            module-level ``headers`` are sent unchanged.
    """

    # Seconds passed to ``requests.post`` as ``timeout`` so that a stalled
    # connection cannot block the app indefinitely.
    REQUEST_TIMEOUT = 120

    def __init__(self, api_key=None):
        self.api_key = api_key

    def prepare_llm_input(self, survey_response, topics):
        """Build the extraction prompt for one piece of document text.

        Args:
            survey_response: Text to analyse; inserted at the end of the prompt.
            topics: Mapping of topic name to topic description.

        Returns:
            The complete prompt string.
        """
        topic_descriptions = "\n".join(
            f"- **{topic}**: {description}" for topic, description in topics.items()
        )

        llm_input = f"""
Your task is to review PDF docling and extract information related to the provided topics. Here are the topic descriptions:

{topic_descriptions}

**Instructions:**
- Extract and summarize the PDF focusing only on the provided topics.
- If a topic is not mentioned in the notes, it should not be included in the Topic_Summary.
- Use **exact quotes** from the original text for each point in your Topic_Summary.
- Exclude erroneous content.
- Do not add additional explanations or instructions.

**Format your response as follows:**
[Topic]
- "Exact quote"
- "Exact quote"
- "Exact quote"

**Meeting Notes:**
{survey_response}
"""
        return llm_input

    def query_api(self, payload):
        """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON reply.

        Raises:
            requests.exceptions.RequestException: On connection failure or
                when no response arrives within ``REQUEST_TIMEOUT`` seconds.
        """
        request_headers = headers
        if self.api_key:
            # Per-instance token takes precedence over the module default.
            request_headers = {**headers, 'Authorization': f'Bearer {self.api_key}'}
        response = requests.post(
            API_URL,
            headers=request_headers,
            json=payload,
            timeout=self.REQUEST_TIMEOUT,
        )
        return response.json()

    def extract_meeting_notes(self, response):
        """Return ``response['outputs']['out-0']``, or ``''`` when absent or null."""
        # `or` also covers an explicit JSON null, which `.get(key, default)`
        # would pass through as None.
        outputs = response.get('outputs') or {}
        return outputs.get('out-0') or ''

    def process_dataframe(self, df, topics):
        """Query the endpoint once per row and attach the replies.

        Args:
            df: DataFrame with a ``Document_Text`` column.
            topics: Mapping of topic name to description, forwarded to
                ``prepare_llm_input``.

        Returns:
            A copy of ``df`` with a fresh 0..n-1 index and a
            ``Topic_Summary`` column. An empty ``df`` yields an empty result
            that still has the ``Topic_Summary`` column.
        """
        summaries = []
        for document_text in df['Document_Text']:
            payload = {
                "user_id": "<USER or Conversation ID>",
                "in-0": self.prepare_llm_input(document_text, topics),
            }
            summaries.append(self.extract_meeting_notes(self.query_api(payload)))

        # assign() works for zero rows as well, unlike selecting the column
        # from a DataFrame built out of an empty list of records.
        return df.reset_index(drop=True).assign(Topic_Summary=summaries)
89
+
90
+ # ---------------------------------------------------------------------------------------
91
+ # Function to Extract Excerpts
92
+ # ---------------------------------------------------------------------------------------
93
def extract_excerpts(processed_df):
    """Explode each row's ``Topic_Summary`` into one row per quoted excerpt.

    A summary is expected to consist of sections that start with a
    ``[Topic]`` header followed by lines of the form ``- "quote"``.

    Args:
        processed_df: DataFrame with ``Document_Text`` and ``Topic_Summary``
            columns.

    Returns:
        DataFrame with columns ``Document_Text``, ``Topic_Summary``,
        ``Excerpt`` and ``Topic``. The columns are present even when no
        excerpt was found, so callers can rely on the schema.
    """
    columns = ['Document_Text', 'Topic_Summary', 'Excerpt', 'Topic']
    new_rows = []

    for _, row in processed_df.iterrows():
        topic_summary = row['Topic_Summary']
        # A missing summary (None / NaN) cannot be parsed; skip it rather
        # than letting re.split raise TypeError.
        if not isinstance(topic_summary, str):
            continue

        # A new section starts at every newline that is followed by "[".
        for section in re.split(r'\n(?=\[)', topic_summary):
            topic_match = re.match(r'\[([^\]]+)\]', section)
            if not topic_match:
                continue
            topic = topic_match.group(1)

            for excerpt in re.findall(r'- "([^"]+)"', section):
                new_rows.append({
                    'Document_Text': row['Document_Text'],
                    'Topic_Summary': topic_summary,
                    'Excerpt': excerpt,
                    'Topic': topic,
                })

    return pd.DataFrame(new_rows, columns=columns)
120
+
121
+ #------------------------------------------------------------------------
122
+ # Streamlit Configuration
123
+ #------------------------------------------------------------------------
124
+
125
+ # Set page configuration
126
# Page-level Streamlit settings: title, icon, centered layout, automatic
# sidebar state, and the custom "Get Help" / "About" menu entries.
st.set_page_config(
    page_title="Choose Your Own Adventure (Topic Extraction) PDF Analysis App",
    page_icon=":bar_chart:",
    layout="centered",
    initial_sidebar_state="auto",
    menu_items={
        'Get Help': 'mailto:[email protected]',
        'About': "This app is built to support PDF analysis"
    }
)
136
+
137
+ #------------------------------------------------------------------------
138
+ # Sidebar
139
+ #------------------------------------------------------------------------
140
+
141
+ # Sidebar with image
142
with st.sidebar:
    # Logo width in pixels.
    image_width = 300
    # The logo was referenced only through an absolute path on the author's
    # machine, which makes st.image fail everywhere else. Keep that path as
    # the first choice, then fall back to a copy stored next to this script.
    image_candidates = [
        "/Users/clevesse/Documents/VSC_Code/PDF_Extraction/PDF_Extraction_streamlit/steelcase_small.png",
        os.path.join(os.path.dirname(os.path.abspath(__file__)), "steelcase_small.png"),
    ]
    image_path = next((path for path in image_candidates if os.path.isfile(path)), None)
    # The logo is decorative: when no candidate exists, render the rest of
    # the sidebar instead of aborting the whole app.
    if image_path is not None:
        st.image(image_path, width=image_width)

    # Team / contact information.
    with st.expander("**WorkSpace Futures**", expanded=True):
        st.write("""
Strategic Market Intelligence
Director: Amy Willard

- **Support**: Cheyne LeVesseur PhD
- **Email**: [email protected]
""")
    st.divider()
    st.subheader('Instructions')

    # Step-by-step usage notes rendered as a markdown list.
    Instructions = """
- **Step 1**: Upload your PDF file.
- **Step 2**: Review the processed meeting notes with extracted excerpts and classifications.
- **Step 3**: Review topic descriptions.
- **Step 4**: Review topic distribution and frequency.
- **Step 5**: Review bar charts of topics.
- **Step 6**: Download the processed data as a CSV file.
"""
    st.markdown(Instructions)
173
+
174
+ # Load SmolDocling model (mlx_vlm version)
175
@st.cache_resource
def load_smol_docling():
    """Load the SmolDocling MLX checkpoint with its processor and config.

    The function is wrapped in ``st.cache_resource``.

    Returns:
        A ``(model, processor, config)`` tuple as produced by
        ``mlx_vlm.load`` and ``mlx_vlm.utils.load_config``.
    """
    checkpoint = "ds4sd/SmolDocling-256M-preview-mlx-bf16"
    vlm, vlm_processor = load(checkpoint)
    return vlm, vlm_processor, load_config(checkpoint)


# Module-level handles used by extract_markdown_from_image.
model, processor, config = load_smol_docling()
183
+
184
+ # Convert PDF to images
185
def convert_pdf_to_images(pdf_file, dpi=300):
    """Render every page of a PDF to a PIL image.

    Args:
        pdf_file: Binary file-like object; its full content is consumed
            with ``read()``.
        dpi: Resolution passed to ``page.get_pixmap``. Defaults to 300,
            the value this function previously hard-coded.

    Returns:
        List of PIL images, one per page, in page order.
    """
    images = []
    # The context manager closes the PyMuPDF document even when rendering a
    # page raises; previously the document was never closed.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=dpi)
            # Round-trip through PNG bytes to obtain a PIL image.
            images.append(Image.open(io.BytesIO(pix.tobytes("png"))))
    return images
195
+
196
+ # Extract structured markdown text using SmolDocling (mlx_vlm)
197
def extract_markdown_from_image(image):
    """Convert one page image to markdown via SmolDocling DocTags.

    Uses the module-level ``model``, ``processor`` and ``config``.

    Args:
        image: Page image handed to the model and to the DocTags parser.

    Returns:
        The markdown export of the document built from the generated DocTags.
    """
    chat_prompt = apply_chat_template(
        processor, config, "Convert this page to docling.", num_images=1
    )

    # Collect streamed text until the chunk that carries the closing tag.
    pieces = []
    for chunk in stream_generate(
        model, processor, chat_prompt, [image], max_tokens=4096, verbose=False
    ):
        pieces.append(chunk.text)
        if "</doctag>" in chunk.text:
            break
    doctags = "".join(pieces)

    # DocTags -> DoclingDocument -> markdown
    tagged = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
    document = DoclingDocument(name="ExtractedDocument")
    document.load_from_doctags(tagged)
    return document.export_to_markdown()
214
+
215
+ # Streamlit UI
216
st.title("Choose Your Own Adventure (Topic Extraction) PDF Analysis App")

uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])

if uploaded_file:
    # Streamlit re-executes this script on every widget interaction (typing
    # a topic, pressing a button, ...). Cache the extraction per uploaded
    # file so the PDF is rendered and run through the model only once.
    file_key = (
        uploaded_file.name,
        uploaded_file.size,
        getattr(uploaded_file, "file_id", None),
    )
    if st.session_state.get("extracted_file_key") != file_key:
        with st.spinner("Processing PDF..."):
            # Make sure the whole file is read even if it was read before.
            uploaded_file.seek(0)
            images = convert_pdf_to_images(uploaded_file)
            markdown_texts = [extract_markdown_from_image(image) for image in images]
        st.session_state["extracted_df"] = pd.DataFrame({'Document_Text': markdown_texts})
        st.session_state["extracted_file_key"] = file_key

    df = st.session_state["extracted_df"]

    # Validate before reporting success; blank strings count as "no text".
    if df.empty or df['Document_Text'].fillna('').astype(str).str.strip().eq('').all():
        st.error("No meaningful text extracted from the PDF.")
        st.stop()

    st.success("PDF processed successfully!")

    st.markdown("### Extracted Markdown Preview")
    st.write(df.head())

    # ---------------------------------------------------------------------------------------
    # User Input for Topics
    # ---------------------------------------------------------------------------------------
    st.markdown("### Enter Topics and Descriptions")
    num_topics = st.number_input("Number of topics", min_value=1, max_value=10, value=1, step=1)

    # Only topics that have both a name and a description are used.
    topics = {}
    for i in range(num_topics):
        topic = st.text_input(f"Topic {i+1} Name", key=f"topic_{i}")
        description = st.text_area(f"Topic {i+1} Description", key=f"description_{i}")
        if topic and description:
            topics[topic] = description

    # Add a button to execute the analysis
    if st.button("Run Analysis"):
        if not topics:
            st.warning("Please enter at least one topic and description.")
            st.stop()

        # ---------------------------------------------------------------------------------------
        # Topic extraction: one API call per page, then one row per excerpt
        # ---------------------------------------------------------------------------------------
        analyzer = SurveyAnalysis()
        processed_df = analyzer.process_dataframe(df, topics)
        df_VIP_extracted = extract_excerpts(processed_df)

        required_columns = ['Document_Text', 'Topic_Summary', 'Excerpt', 'Topic']
        missing_columns = [col for col in required_columns if col not in df_VIP_extracted.columns]

        if missing_columns:
            st.error(f"Missing columns after processing: {missing_columns}")
            st.stop()

        df_VIP_extracted = df_VIP_extracted[required_columns]

        st.markdown("### Processed Meeting Notes")
        st.dataframe(df_VIP_extracted)

        st.write(f"**Number of meeting notes analyzed:** {len(df)}")
        st.write(f"**Number of excerpts extracted:** {len(df_VIP_extracted)}")

        # CSV download
        csv = df_VIP_extracted.to_csv(index=False)
        st.download_button(
            "Download data as CSV",
            data=csv,
            file_name='extracted_meeting_notes.csv',
            mime='text/csv'
        )

        # Topic distribution visualization
        topic_counts = df_VIP_extracted['Topic'].value_counts()
        frequency_table = pd.DataFrame({'Topic': topic_counts.index, 'Count': topic_counts.values})
        frequency_table['Percentage'] = (frequency_table['Count'] / frequency_table['Count'].sum() * 100).round(0)

        st.markdown("### Topic Distribution")
        st.dataframe(frequency_table)

        fig, ax = plt.subplots(figsize=(10, 5))
        ax.bar(frequency_table['Topic'], frequency_table['Count'], color='#3d9aa1')
        ax.set_ylabel('Count')
        ax.set_title('Frequency of Topics')
        st.pyplot(fig)
        # Release the figure; pyplot otherwise keeps every figure created
        # across reruns alive.
        plt.close(fig)

else:
    st.info("Please upload a PDF file to begin.")