# ---------------------------------------------------------------------------------------
# Imports and Options
# ---------------------------------------------------------------------------------------
import streamlit as st
import pandas as pd
import re
import fitz  # PyMuPDF
import io
import matplotlib.pyplot as plt
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
import torch
import os
from huggingface_hub import InferenceClient

# ---------------------------------------------------------------------------------------
# Streamlit Page Configuration
# ---------------------------------------------------------------------------------------
st.set_page_config(
    page_title="Choose Your Own Adventure (Topic Extraction) PDF Analysis App",
    page_icon=":bar_chart:",
    layout="centered",
    initial_sidebar_state="auto",
    menu_items={
        'Get Help': 'mailto:support@mtss.ai',
        'About': "This app is built to support PDF analysis"
    }
)

# ---------------------------------------------------------------------------------------
# Session State Initialization
# ---------------------------------------------------------------------------------------
# The processing flag defaults to False; the text containers start empty.
for key in ['pdf_processed', 'markdown_texts', 'df']:
    if key not in st.session_state:
        st.session_state[key] = False if key == 'pdf_processed' else []

# ---------------------------------------------------------------------------------------
# API Configuration
# ---------------------------------------------------------------------------------------
# Retrieve the Hugging Face API key from environment variables
hf_api_key = os.getenv('HF_API_KEY')
if not hf_api_key:
    raise ValueError("HF_API_KEY not set in environment variables")

# Create the Hugging Face inference client
client = InferenceClient(api_key=hf_api_key)
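# Example invocation (hypothetical: assumes this script is saved as app.py and
# that a real token replaces the placeholder):
#
#   export HF_API_KEY=hf_xxxxxxxxxxxxxxxxxxxx
#   streamlit run app.py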
# ---------------------------------------------------------------------------------------
# Survey Analysis Class
# ---------------------------------------------------------------------------------------
class SurveyAnalysis:
    def prepare_llm_input(self, survey_response, topics):
        # Build a bulleted list of topic names and descriptions for the prompt
        topic_descriptions = "\n".join([f"- **{t}**: {d}" for t, d in topics.items()])
        return f"""Extract and summarize PDF notes based on topics:
{topic_descriptions}

Instructions:
- Extract exact quotes per topic.
- Ignore irrelevant topics.

Format:
[Topic]
- "Exact quote"

Meeting Notes:
{survey_response}
"""

    def prompt_response_from_hf_llm(self, llm_input):
        # Define a system prompt to guide the model's responses
        system_prompt = """
        An expert Implementation Specialist at Michigan's Multi-Tiered System of Support Technical Assistance Center (MiMTSS TA Center) with deep expertise in SWPBIS, SEL, Structured Literacy, Science of Reading, and family engagement practices.

        Analyze educational data and provide evidence-based recommendations for improving student outcomes across multiple tiers of support, drawing from established frameworks in behavioral interventions, literacy instruction, and family engagement.

        Operating within Michigan's educational system to support schools in implementing multi-tiered support systems, with access to student metrics data and knowledge of state-specific educational requirements and MTSS frameworks.

        Deliver insights through clear, actionable recommendations supported by data analysis, incorporating technical expertise while maintaining accessibility for educators and administrators at various levels of MTSS implementation.
        """

        # Query the model through the Hugging Face Inference API
        response = client.chat.completions.create(
            model="meta-llama/Llama-3.1-70B-Instruct",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": llm_input}
            ],
            stream=True,
            temperature=0.5,
            max_tokens=1024,
            top_p=0.7
        )

        # Accumulate the streamed chunks into a single string; some chunks
        # (e.g., the final one) may carry no content, so guard against None
        response_content = ""
        for message in response:
            delta = message.choices[0].delta.content
            if delta:
                response_content += delta
        return response_content.strip()

    def extract_text(self, response):
        return response

    def process_dataframe(self, df, topics):
        # Run the topic-extraction prompt once per document row and attach the result
        results = []
        for _, row in df.iterrows():
            llm_input = self.prepare_llm_input(row['Document_Text'], topics)
            response = self.prompt_response_from_hf_llm(llm_input)
            print("AI Response:", response)  # Debugging: print the AI response
            notes = self.extract_text(response)
            results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
        return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)

# ---------------------------------------------------------------------------------------
# Helper Functions
# ---------------------------------------------------------------------------------------
@st.cache_resource
def load_smol_docling():
    # Load the SmolDocling vision-language model once and cache it across reruns
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
    model = AutoModelForVision2Seq.from_pretrained(
        "ds4sd/SmolDocling-256M-preview", torch_dtype=torch.float32
    ).to(device)
    return model, processor

model, processor = load_smol_docling()

def convert_pdf_to_images(pdf_file, dpi=150, max_size=1600):
    # Render each PDF page to an RGB image, downscaling so the longest side
    # stays within max_size pixels
    images = []
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    for page in doc:
        pix = page.get_pixmap(dpi=dpi)
        img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
        img.thumbnail((max_size, max_size), Image.LANCZOS)
        images.append(img)
    return images

def extract_markdown_from_image(image):
    # Run SmolDocling on one page image and convert the DocTags output to Markdown
    device = "cuda" if torch.cuda.is_available() else "cpu"
    prompt = processor.apply_chat_template(
        [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Convert this page to docling."}]}],
        add_generation_prompt=True
    )
    inputs = processor(text=prompt, images=[image], return_tensors="pt").to(device)
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=1024)
    # Decode only the newly generated tokens and strip the model's end-of-turn marker
    doctags = processor.batch_decode(
        generated_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=False
    )[0].replace("<end_of_utterance>", "").strip()
    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
    doc = DoclingDocument(name="ExtractedDocument")
    doc.load_from_doctags(doctags_doc)
    return doc.export_to_markdown()
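# Illustrative only: extract_excerpts() below parses Topic_Summary strings in
# the shape requested from the model by prepare_llm_input(). A hypothetical
# response for topics "Attendance" and "Family Engagement" might look like:
#
#   [Attendance]
#   - "Chronic absenteeism dropped in the second semester."
#   [Family Engagement]
#   - "Parents requested monthly progress updates."
#
# Each bracketed header opens a topic section; each quoted bullet becomes one
# row in the output DataFrame.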
def extract_excerpts(processed_df):
    # Parse each Topic_Summary into (Topic, Excerpt) rows using the
    # "[Topic]" / '- "quote"' format requested in the prompt
    rows = []
    for _, r in processed_df.iterrows():
        for sec in re.split(r'\n(?=\[)', r['Topic_Summary']):
            topic_match = re.match(r'\[([^\]]+)\]', sec)
            if topic_match:
                topic = topic_match.group(1)
                excerpts = re.findall(r'- "([^"]+)"', sec)
                for excerpt in excerpts:
                    rows.append({
                        'Document_Text': r['Document_Text'],
                        'Topic_Summary': r['Topic_Summary'],
                        'Excerpt': excerpt,
                        'Topic': topic
                    })
    print("Extracted Rows:", rows)  # Debugging: print extracted rows
    return pd.DataFrame(rows)

# ---------------------------------------------------------------------------------------
# Streamlit UI
# ---------------------------------------------------------------------------------------
st.title("Choose Your Own Adventure (Topic Extraction) PDF Analysis App")

uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])

if uploaded_file and not st.session_state['pdf_processed']:
    with st.spinner("Processing PDF..."):
        images = convert_pdf_to_images(uploaded_file)
        markdown_texts = [extract_markdown_from_image(img) for img in images]
        st.session_state['markdown_texts'] = markdown_texts
        st.session_state['df'] = pd.DataFrame({'Document_Text': markdown_texts})
        st.session_state['pdf_processed'] = True
    st.success("PDF processed successfully!")

if st.session_state['pdf_processed']:
    st.markdown("### Extracted Text Preview")
    st.write(st.session_state['df'].head())

    st.markdown("### Enter Topics and Descriptions")
    num_topics = st.number_input("Number of topics", 1, 10, 1)
    topics = {}
    for i in range(num_topics):
        topic = st.text_input(f"Topic {i+1} Name", key=f"topic_{i}")
        desc = st.text_area(f"Topic {i+1} Description", key=f"description_{i}")
        if topic and desc:
            topics[topic] = desc

    if st.button("Run Analysis"):
        if not topics:
            st.warning("Please enter at least one topic and description.")
            st.stop()

        analyzer = SurveyAnalysis()
        processed_df = analyzer.process_dataframe(st.session_state['df'], topics)
        extracted_df = extract_excerpts(processed_df)

        st.markdown("### Extracted Excerpts")
        st.dataframe(extracted_df)

        csv = extracted_df.to_csv(index=False)
        st.download_button("Download CSV", csv, "extracted_notes.csv", "text/csv")

        if not extracted_df.empty:
            topic_counts = extracted_df['Topic'].value_counts()
            fig, ax = plt.subplots()
            topic_counts.plot.bar(ax=ax, color='#3d9aa1')
            st.pyplot(fig)
        else:
            st.warning("No topics were extracted. Please check the input data and topics.")

if not uploaded_file:
    st.info("Please upload a PDF file to begin.")
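# Note: the first call to load_smol_docling() downloads the SmolDocling weights
# via from_pretrained(), which can take a while; @st.cache_resource then keeps
# the model and processor in memory for the server process, so the load happens
# once rather than on every Streamlit rerun.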