ProfessorLeVesseur committed (verified)
Commit 56f8447 · Parent: c654aff

Update app.py

Files changed (1):
  1. app.py  +352 -27

app.py CHANGED
@@ -1,3 +1,309 @@
+ # # ---------------------------------------------------------------------------------------
+ # # Imports and Options
+ # # ---------------------------------------------------------------------------------------
+ # import streamlit as st
+ # import pandas as pd
+ # import requests
+ # import re
+ # import fitz # PyMuPDF
+ # import io
+ # import matplotlib.pyplot as plt
+ # from PIL import Image
+ # from mlx_vlm import load, generate
+ # from mlx_vlm.prompt_utils import apply_chat_template
+ # from mlx_vlm.utils import load_config, stream_generate
+ # from docling_core.types.doc.document import DocTagsDocument, DoclingDocument
+
+ # # Set Streamlit to wide mode
+ # # st.set_page_config(layout="wide")
+
+ # # ---------------------------------------------------------------------------------------
+ # # API Configuration
+ # # ---------------------------------------------------------------------------------------
+ # API_URL = "https://api.stack-ai.com/inference/v0/run/2df89a6c-a4af-4576-880e-27058e498f02/67acad8b0603ba4631db38e7"
+ # headers = {
+ # 'Authorization': 'Bearer a9e4979e-cdbe-49ea-a193-53562a784805',
+ # 'Content-Type': 'application/json'
+ # }
+
+ # # ---------------------------------------------------------------------------------------
+ # # Survey Analysis Class
+ # # ---------------------------------------------------------------------------------------
+ # class SurveyAnalysis:
+ # def __init__(self, api_key=None):
+ # self.api_key = api_key
+
+ # def prepare_llm_input(self, survey_response, topics):
+ # # Create topic description string from user input
+ # topic_descriptions = "\n".join([f"- **{topic}**: {description}" for topic, description in topics.items()])
+
+ # llm_input = f"""
+ # Your task is to review PDF docling and extract information related to the provided topics. Here are the topic descriptions:
+
+ # {topic_descriptions}
+
+ # **Instructions:**
+ # - Extract and summarize the PDF focusing only on the provided topics.
+ # - If a topic is not mentioned in the notes, it should not be included in the Topic_Summary.
+ # - Use **exact quotes** from the original text for each point in your Topic_Summary.
+ # - Exclude erroneous content.
+ # - Do not add additional explanations or instructions.
+
+ # **Format your response as follows:**
+ # [Topic]
+ # - "Exact quote"
+ # - "Exact quote"
+ # - "Exact quote"
+
+ # **Meeting Notes:**
+ # {survey_response}
+ # """
+ # return llm_input
+
+ # def query_api(self, payload):
+ # response = requests.post(API_URL, headers=headers, json=payload)
+ # return response.json()
+
+ # def extract_meeting_notes(self, response):
+ # output = response.get('outputs', {}).get('out-0', '')
+ # return output
+
+ # def process_dataframe(self, df, topics):
+ # results = []
+ # for _, row in df.iterrows():
+ # llm_input = self.prepare_llm_input(row['Document_Text'], topics)
+ # payload = {
+ # "user_id": "<USER or Conversation ID>",
+ # "in-0": llm_input
+ # }
+ # response = self.query_api(payload)
+ # meeting_notes = self.extract_meeting_notes(response)
+ # results.append({
+ # 'Document_Text': row['Document_Text'],
+ # 'Topic_Summary': meeting_notes
+ # })
+
+ # result_df = pd.DataFrame(results)
+ # df = df.reset_index(drop=True)
+ # return pd.concat([df, result_df[['Topic_Summary']]], axis=1)
+
+ # # ---------------------------------------------------------------------------------------
+ # # Function to Extract Excerpts
+ # # ---------------------------------------------------------------------------------------
+ # def extract_excerpts(processed_df):
+ # new_rows = []
+
+ # for _, row in processed_df.iterrows():
+ # Topic_Summary = row['Topic_Summary']
+
+ # # Split the Topic_Summary by topic
+ # sections = re.split(r'\n(?=\[)', Topic_Summary)
+
+ # for section in sections:
+ # # Extract the topic
+ # topic_match = re.match(r'\[([^\]]+)\]', section)
+ # if topic_match:
+ # topic = topic_match.group(1)
+
+ # # Extract all excerpts within the section
+ # excerpts = re.findall(r'- "([^"]+)"', section)
+
+ # for excerpt in excerpts:
+ # new_rows.append({
+ # 'Document_Text': row['Document_Text'],
+ # 'Topic_Summary': row['Topic_Summary'],
+ # 'Excerpt': excerpt,
+ # 'Topic': topic
+ # })
+
+ # return pd.DataFrame(new_rows)
+
+ # #------------------------------------------------------------------------
+ # # Streamlit Configuration
+ # #------------------------------------------------------------------------
+
+ # # Set page configuration
+ # st.set_page_config(
+ # page_title="Choose Your Own Adventure (Topic Extraction) PDF Analysis App",
+ # page_icon=":bar_chart:",
+ # layout="centered",
+ # initial_sidebar_state="auto",
+ # menu_items={
+ # 'Get Help': 'mailto:[email protected]',
+ # 'About': "This app is built to support PDF analysis"
+ # }
+ # )
+
+ # #------------------------------------------------------------------------
+ # # Sidebar
+ # #------------------------------------------------------------------------
+
+ # # Sidebar with image
+ # with st.sidebar:
+ # # Set the desired width in pixels
+ # image_width = 300
+ # # Define the path to the image
+ # # image_path = "steelcase_small.png"
+ # image_path = "mtss.ai_small.png"
+ # # Display the image
+ # st.image(image_path, width=image_width)
+
+ # # Additional sidebar content
+
+ # with st.expander("**MTSS.ai**", expanded=True):
+ # st.write("""
+ # - **Support**: Cheyne LeVesseur PhD
+ # - **Email**: [email protected]
+ # """)
+ # st.divider()
+ # st.subheader('Instructions')
+
+ # Instructions = """
+ # - **Step 1**: Upload your PDF file.
+ # - **Step 2**: Review the processed text.
+ # - **Step 3**: Add your topics and descriptions of interest.
+ # - **Step 4**: Review the extracted excerpts and classifications, and topic distribution and frequency.
+ # - **Step 5**: Review bar charts of topics.
+ # - **Step 6**: Download the processed data as a CSV file.
+ # """
+ # st.markdown(Instructions)
+
+ # # Load SmolDocling model ()
+ # @st.cache_resource
+ # def load_smol_docling():
+ # model_path = "ds4sd/SmolDocling-256M-preview"
+ # model, processor = load(model_path)
+ # config = load_config(model_path)
+ # return model, processor, config
+
+ # model, processor, config = load_smol_docling()
+
+ # # Convert PDF to images
+ # def convert_pdf_to_images(pdf_file):
+ # images = []
+ # doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+ # for page_number in range(len(doc)):
+ # page = doc.load_page(page_number)
+ # pix = page.get_pixmap(dpi=300) # Higher DPI for clarity
+ # img_data = pix.tobytes("png")
+ # image = Image.open(io.BytesIO(img_data))
+ # images.append(image)
+ # return images
+
+ # # Extract structured markdown text using SmolDocling (mlx_vlm)
+ # def extract_markdown_from_image(image):
+ # prompt = "Convert this page to docling."
+ # formatted_prompt = apply_chat_template(processor, config, prompt, num_images=1)
+ # output = ""
+
+ # for token in stream_generate(
+ # model, processor, formatted_prompt, [image], max_tokens=4096, verbose=False):
+ # output += token.text
+ # if "</doctag>" in token.text:
+ # break
+
+ # # Convert DocTags to Markdown
+ # doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([output], [image])
+ # doc = DoclingDocument(name="ExtractedDocument")
+ # doc.load_from_doctags(doctags_doc)
+ # markdown_text = doc.export_to_markdown()
+ # return markdown_text
+
+ # # Streamlit UI
+ # st.title("Choose Your Own Adventure (Topic Extraction) PDF Analysis App")
+
+ # uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
+
+ # if uploaded_file:
+ # with st.spinner("Processing PDF..."):
+ # images = convert_pdf_to_images(uploaded_file)
+
+ # markdown_texts = []
+ # for idx, image in enumerate(images):
+ # markdown_text = extract_markdown_from_image(image)
+ # markdown_texts.append(markdown_text)
+
+ # df = pd.DataFrame({'Document_Text': markdown_texts})
+
+ # st.success("PDF processed successfully!")
+
+ # # Check if extraction was successful
+ # if df.empty or df['Document_Text'].isnull().all():
+ # st.error("No meaningful text extracted from the PDF.")
+ # st.stop()
+
+ # st.markdown("### Extracted Markdown Preview")
+ # st.write(df.head())
+
+ # # ---------------------------------------------------------------------------------------
+ # # User Input for Topics
+ # # ---------------------------------------------------------------------------------------
+ # st.markdown("### Enter Topics and Descriptions")
+ # num_topics = st.number_input("Number of topics", min_value=1, max_value=10, value=1, step=1)
+
+ # topics = {}
+ # for i in range(num_topics):
+ # topic = st.text_input(f"Topic {i+1} Name", key=f"topic_{i}")
+ # description = st.text_area(f"Topic {i+1} Description", key=f"description_{i}")
+ # if topic and description:
+ # topics[topic] = description
+
+ # # Add a button to execute the analysis
+ # if st.button("Run Analysis"):
+ # if not topics:
+ # st.warning("Please enter at least one topic and description.")
+ # st.stop()
+
+ # # ---------------------------------------------------------------------------------------
+ # # Your existing SurveyAnalysis and extract_excerpts functions remain unchanged here:
+ # # ---------------------------------------------------------------------------------------
+ # analyzer = SurveyAnalysis()
+ # processed_df = analyzer.process_dataframe(df, topics)
+ # df_VIP_extracted = extract_excerpts(processed_df)
+
+ # required_columns = ['Document_Text', 'Topic_Summary', 'Excerpt', 'Topic']
+ # missing_columns = [col for col in required_columns if col not in df_VIP_extracted.columns]
+
+ # if missing_columns:
+ # st.error(f"Missing columns after processing: {missing_columns}")
+ # st.stop()
+
+ # df_VIP_extracted = df_VIP_extracted[required_columns]
+
+ # st.markdown("### Processed Meeting Notes")
+ # st.dataframe(df_VIP_extracted)
+
+ # st.write(f"**Number of meeting notes analyzed:** {len(df)}")
+ # st.write(f"**Number of excerpts extracted:** {len(df_VIP_extracted)}")
+
+ # # CSV download
+ # csv = df_VIP_extracted.to_csv(index=False)
+ # st.download_button(
+ # "Download data as CSV",
+ # data=csv,
+ # file_name='extracted_meeting_notes.csv',
+ # mime='text/csv'
+ # )
+
+ # # Topic distribution visualization
+ # topic_counts = df_VIP_extracted['Topic'].value_counts()
+ # frequency_table = pd.DataFrame({'Topic': topic_counts.index, 'Count': topic_counts.values})
+ # frequency_table['Percentage'] = (frequency_table['Count'] / frequency_table['Count'].sum() * 100).round(0)
+
+ # st.markdown("### Topic Distribution")
+ # st.dataframe(frequency_table)
+
+ # fig, ax = plt.subplots(figsize=(10, 5))
+ # ax.bar(frequency_table['Topic'], frequency_table['Count'], color='#3d9aa1')
+ # ax.set_ylabel('Count')
+ # ax.set_title('Frequency of Topics')
+ # st.pyplot(fig)
+
+ # else:
+ # st.info("Please upload a PDF file to begin.")
+
+
+
# ---------------------------------------------------------------------------------------
# Imports and Options
# ---------------------------------------------------------------------------------------
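The block above keeps the original mlx_vlm version of the app as commented-out reference code, including the regex-based parsing that `extract_excerpts` applies to each `Topic_Summary`. As a quick illustration of the format that parsing expects, here is a minimal sketch run on an invented `Topic_Summary` string (the topic names and quotes are hypothetical, not taken from the app):

```python
import re

import pandas as pd

# Invented example of the '[Topic]' / '- "quote"' format the prompt asks the LLM to return.
topic_summary = (
    '[Attendance]\n'
    '- "Students missed 12 days on average."\n'
    '[Behavior]\n'
    '- "Office referrals dropped in March."'
)

# Same regexes as extract_excerpts: split on lines that open a new [Topic] section,
# read the bracketed topic name, then collect every quoted excerpt in that section.
rows = []
for section in re.split(r'\n(?=\[)', topic_summary):
    topic_match = re.match(r'\[([^\]]+)\]', section)
    if topic_match:
        topic = topic_match.group(1)
        for excerpt in re.findall(r'- "([^"]+)"', section):
            rows.append({'Topic': topic, 'Excerpt': excerpt})

print(pd.DataFrame(rows))  # two rows: one excerpt tagged Attendance, one tagged Behavior
```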
@@ -9,13 +315,10 @@ import fitz # PyMuPDF
import io
import matplotlib.pyplot as plt
from PIL import Image
- from mlx_vlm import load, generate
- from mlx_vlm.prompt_utils import apply_chat_template
- from mlx_vlm.utils import load_config, stream_generate
- from docling_core.types.doc.document import DocTagsDocument, DoclingDocument
-
- # Set Streamlit to wide mode
- # st.set_page_config(layout="wide")
+ from transformers import AutoProcessor, AutoModelForVision2Seq
+ from docling_core.types.doc import DoclingDocument
+ from docling_core.types.doc.document import DocTagsDocument
+ import torch

# ---------------------------------------------------------------------------------------
# API Configuration
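This hunk swaps the Apple-silicon-only mlx_vlm stack for the portable transformers + torch stack, while keeping docling-core for the DocTags-to-Markdown conversion. A small environment check along the lines below can confirm the new dependencies resolve before the Space boots the model (the package set is the one implied by the imports; version output will vary by environment):

```python
# Sanity-check the dependencies implied by the new import block.
from importlib.metadata import version

import torch
from transformers import AutoProcessor, AutoModelForVision2Seq  # noqa: F401
from docling_core.types.doc import DoclingDocument  # re-export of docling_core.types.doc.document.DoclingDocument
from docling_core.types.doc.document import DocTagsDocument  # noqa: F401

for pkg in ("transformers", "docling-core", "torch"):
    print(pkg, version(pkg))
print("CUDA available:", torch.cuda.is_available())
```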
@@ -168,15 +471,18 @@ with st.sidebar:
"""
st.markdown(Instructions)

- # Load SmolDocling model (mlx_vlm version)
+ # Load SmolDocling model using transformers
@st.cache_resource
def load_smol_docling():
- model_path = "ds4sd/SmolDocling-256M-preview"
- model, processor = load(model_path)
- config = load_config(model_path)
- return model, processor, config
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
+ model = AutoModelForVision2Seq.from_pretrained(
+ "ds4sd/SmolDocling-256M-preview",
+ torch_dtype=torch.float32
+ ).to(device)
+ return model, processor

- model, processor, config = load_smol_docling()
+ model, processor = load_smol_docling()

# Convert PDF to images
def convert_pdf_to_images(pdf_file):
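The replacement loader pins `torch.float32`, which is the safe choice on a CPU-only host. On a GPU host, half-precision weights roughly halve memory use; one possible variant of the loader (an illustration, not part of this commit) is sketched below:

```python
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

MODEL_ID = "ds4sd/SmolDocling-256M-preview"

def load_smol_docling_auto():
    # Use bfloat16 on CUDA to cut weight memory, float32 on CPU for compatibility.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.bfloat16 if device == "cuda" else torch.float32
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    model = AutoModelForVision2Seq.from_pretrained(MODEL_ID, torch_dtype=dtype).to(device)
    model.eval()  # inference only; no gradients needed
    return model, processor
```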
@@ -190,22 +496,41 @@ def convert_pdf_to_images(pdf_file):
images.append(image)
return images

- # Extract structured markdown text using SmolDocling (mlx_vlm)
+ # Extract structured markdown text using SmolDocling (transformers)
def extract_markdown_from_image(image):
- prompt = "Convert this page to docling."
- formatted_prompt = apply_chat_template(processor, config, prompt, num_images=1)
- output = ""
-
- for token in stream_generate(
- model, processor, formatted_prompt, [image], max_tokens=4096, verbose=False):
- output += token.text
- if "</doctag>" in token.text:
- break
-
- # Convert DocTags to Markdown
- doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([output], [image])
+ prompt_text = "Convert this page to docling."
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Prepare inputs
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image"},
+ {"type": "text", "text": prompt_text}
+ ]
+ }
+ ]
+ prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+ inputs = processor(text=prompt, images=[image], return_tensors="pt").to(device)
+
+ # Generate outputs
+ generated_ids = model.generate(**inputs, max_new_tokens=1024)
+ prompt_length = inputs.input_ids.shape[1]
+ trimmed_generated_ids = generated_ids[:, prompt_length:]
+ doctags = processor.batch_decode(trimmed_generated_ids, skip_special_tokens=False)[0].lstrip()
+
+ # Clean the output
+ doctags = doctags.replace("<end_of_utterance>", "").strip()
+
+ # Populate document
+ doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
+
+ # Create a docling document
doc = DoclingDocument(name="ExtractedDocument")
doc.load_from_doctags(doctags_doc)
+
+ # Export as markdown
markdown_text = doc.export_to_markdown()
return markdown_text

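Together with `convert_pdf_to_images`, the rewritten extraction function gives a complete page-to-Markdown path that can be exercised outside Streamlit. The sketch below mirrors the calls added in this hunk as a standalone smoke test; `sample.pdf` is a placeholder path, and the first run downloads the model weights:

```python
# Standalone smoke test for the transformers-based SmolDocling path (mirrors the code above).
import io

import fitz  # PyMuPDF
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument

device = "cuda" if torch.cuda.is_available() else "cpu"
processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
model = AutoModelForVision2Seq.from_pretrained(
    "ds4sd/SmolDocling-256M-preview", torch_dtype=torch.float32
).to(device)

# Render the first page of a PDF to a PIL image, as convert_pdf_to_images does.
page = fitz.open("sample.pdf").load_page(0)  # placeholder path
image = Image.open(io.BytesIO(page.get_pixmap(dpi=300).tobytes("png")))

# Same prompt / generate / decode sequence as extract_markdown_from_image.
messages = [{"role": "user",
             "content": [{"type": "image"},
                         {"type": "text", "text": "Convert this page to docling."}]}]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt").to(device)
generated_ids = model.generate(**inputs, max_new_tokens=1024)
doctags = processor.batch_decode(
    generated_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=False
)[0].replace("<end_of_utterance>", "").strip()

# DocTags -> DoclingDocument -> Markdown, exactly as in the new function.
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
doc = DoclingDocument(name="SmokeTest")
doc.load_from_doctags(doctags_doc)
print(doc.export_to_markdown()[:500])
```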
@@ -300,4 +625,4 @@ if uploaded_file:
st.pyplot(fig)

else:
- st.info("Please upload a PDF file to begin.")
+ st.info("Please upload a PDF file to begin.")