# ---------------------------------------------------------------------------------------
# Imports and Options
# ---------------------------------------------------------------------------------------
import streamlit as st
import pandas as pd
import re
import fitz  # PyMuPDF
import io
import matplotlib.pyplot as plt
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
import torch
import os
from huggingface_hub import InferenceClient
# ---------------------------------------------------------------------------------------
# Streamlit Page Configuration
# ---------------------------------------------------------------------------------------
st.set_page_config(
    page_title="Choose Your Own Adventure (Topic Extraction) PDF Analysis App",
    page_icon=":bar_chart:",
    layout="centered",
    initial_sidebar_state="auto",
    menu_items={
        'Get Help': 'mailto:support@mtss.ai',
        'About': "This app is built to support PDF analysis"
    }
)
# ---------------------------------------------------------------------------------------
# Session State Initialization
# ---------------------------------------------------------------------------------------
# 'pdf_processed' is a boolean flag; the other keys hold accumulated results.
for key in ['pdf_processed', 'markdown_texts', 'df']:
    if key not in st.session_state:
        st.session_state[key] = False if key == 'pdf_processed' else []
# ---------------------------------------------------------------------------------------
# API Configuration
# ---------------------------------------------------------------------------------------
# Retrieve Hugging Face API key from environment variables
hf_api_key = os.getenv('HF_API_KEY')
if not hf_api_key:
    raise ValueError("HF_API_KEY not set in environment variables")
# Create the Hugging Face inference client
client = InferenceClient(api_key=hf_api_key)
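# A minimal sketch of supplying the key before launch (HF_API_KEY is this
# app's own variable name, and app.py is an assumed filename):
#   export HF_API_KEY="hf_..."   # or set it in your deployment's secrets store
#   streamlit run app.py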
# ---------------------------------------------------------------------------------------
# Survey Analysis Class
# ---------------------------------------------------------------------------------------
class SurveyAnalysis:
    def prepare_llm_input(self, survey_response, topics):
        # One bulleted line per topic: "- **Topic**: description"
        topic_descriptions = "\n".join([f"- **{t}**: {d}" for t, d in topics.items()])
        return f"""Extract and summarize PDF notes based on topics:
{topic_descriptions}

Instructions:
- Extract exact quotes per topic.
- Ignore irrelevant topics.

Format:
[Topic]
- "Exact quote"

Meeting Notes:
{survey_response}
"""

    def prompt_response_from_hf_llm(self, llm_input):
        # System prompt that frames the model as an MTSS implementation specialist
        system_prompt = """
        You are an expert Implementation Specialist at Michigan's Multi-Tiered System of Support Technical Assistance Center (MiMTSS TA Center) with deep expertise in SWPBIS, SEL, Structured Literacy, Science of Reading, and family engagement practices.
        Analyze educational data and provide evidence-based recommendations for improving student outcomes across multiple tiers of support, drawing from established frameworks in behavioral interventions, literacy instruction, and family engagement.
        You operate within Michigan's educational system to support schools in implementing multi-tiered support systems, with access to student metrics data and knowledge of state-specific educational requirements and MTSS frameworks.
        Deliver insights through clear, actionable recommendations supported by data analysis, incorporating technical expertise while maintaining accessibility for educators and administrators at various levels of MTSS implementation.
        """

        # Query the Hugging Face chat-completion endpoint with streaming enabled
        response = client.chat.completions.create(
            model="meta-llama/Llama-3.1-70B-Instruct",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": llm_input}
            ],
            stream=True,
            temperature=0.5,
            max_tokens=1024,
            top_p=0.7
        )

        # Accumulate the streamed chunks; delta.content can be None on some
        # chunks (e.g. the final one), so guard against it.
        response_content = ""
        for message in response:
            response_content += message.choices[0].delta.content or ""
        return response_content.strip()

    def extract_text(self, response):
        return response

    def process_dataframe(self, df, topics):
        # Run the extraction prompt once per document and collect the summaries.
        results = []
        for _, row in df.iterrows():
            llm_input = self.prepare_llm_input(row['Document_Text'], topics)
            response = self.prompt_response_from_hf_llm(llm_input)
            print("AI Response:", response)  # Debugging: print the AI response
            notes = self.extract_text(response)
            results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
        return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)
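# Illustrative usage sketch (hypothetical inputs, not executed by the app):
#   analyzer = SurveyAnalysis()
#   demo_df = pd.DataFrame({'Document_Text': ["Attendance dipped in March."]})
#   demo_topics = {"Attendance": "Mentions of student attendance"}
#   analyzer.process_dataframe(demo_df, demo_topics)
# returns demo_df with an added 'Topic_Summary' column holding the model's
# per-topic quotes.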
# ---------------------------------------------------------------------------------------
# Helper Functions
# ---------------------------------------------------------------------------------------
@st.cache_resource
def load_smol_docling():
    # Load the SmolDocling vision-to-sequence model and its processor once,
    # cached across Streamlit reruns.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
    model = AutoModelForVision2Seq.from_pretrained(
        "ds4sd/SmolDocling-256M-preview", torch_dtype=torch.float32
    ).to(device)
    return model, processor

model, processor = load_smol_docling()
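# Note: the first call to load_smol_docling() downloads the model weights from
# the Hugging Face Hub; @st.cache_resource then reuses the loaded model and
# processor across reruns instead of reloading them per session.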
def convert_pdf_to_images(pdf_file, dpi=150, max_size=1600):
    # Render each PDF page to an RGB image, downscaling so the longest side
    # stays within max_size pixels.
    images = []
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    for page in doc:
        pix = page.get_pixmap(dpi=dpi)
        img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
        img.thumbnail((max_size, max_size), Image.LANCZOS)
        images.append(img)
    return images
def extract_markdown_from_image(image):
    # Run SmolDocling on a single page image and convert its DocTags output
    # to Markdown via docling-core.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    prompt = processor.apply_chat_template(
        [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Convert this page to docling."}]}],
        add_generation_prompt=True
    )
    inputs = processor(text=prompt, images=[image], return_tensors="pt").to(device)
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=1024)
    # skip_special_tokens=False keeps the DocTags markup intact; strip the
    # end-of-utterance marker left in the decoded output.
    doctags = processor.batch_decode(
        generated_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=False
    )[0].replace("<end_of_utterance>", "").strip()
    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
    doc = DoclingDocument(name="ExtractedDocument")
    doc.load_from_doctags(doctags_doc)
    return doc.export_to_markdown()
def extract_excerpts(processed_df):
    # Split each summary into '[Topic]' sections and pull out the quoted
    # excerpts listed under each one.
    rows = []
    for _, r in processed_df.iterrows():
        for sec in re.split(r'\n(?=\[)', r['Topic_Summary']):
            topic_match = re.match(r'\[([^\]]+)\]', sec)
            if topic_match:
                topic = topic_match.group(1)
                excerpts = re.findall(r'- "([^"]+)"', sec)
                for excerpt in excerpts:
                    rows.append({
                        'Document_Text': r['Document_Text'],
                        'Topic_Summary': r['Topic_Summary'],
                        'Excerpt': excerpt,
                        'Topic': topic
                    })
    print("Extracted Rows:", rows)  # Debugging: print extracted rows
    return pd.DataFrame(rows)
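# extract_excerpts() assumes 'Topic_Summary' follows the format requested in
# the prompt, e.g. (hypothetical model output):
#   [School Climate]
#   - "Students reported feeling safe in hallways."
#   [Attendance]
#   - "Chronic absenteeism fell this semester."
# Sections without a leading [Topic] header, or lines not in the - "..."
# form, are silently skipped.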
# ---------------------------------------------------------------------------------------
# Streamlit UI
# ---------------------------------------------------------------------------------------
st.title("Choose Your Own Adventure (Topic Extraction) PDF Analysis App")

uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])

if uploaded_file and not st.session_state['pdf_processed']:
    with st.spinner("Processing PDF..."):
        images = convert_pdf_to_images(uploaded_file)
        markdown_texts = [extract_markdown_from_image(img) for img in images]
        st.session_state['df'] = pd.DataFrame({'Document_Text': markdown_texts})
        st.session_state['pdf_processed'] = True
    st.success("PDF processed successfully!")

if st.session_state['pdf_processed']:
    st.markdown("### Extracted Text Preview")
    st.write(st.session_state['df'].head())

    st.markdown("### Enter Topics and Descriptions")
    num_topics = st.number_input("Number of topics", 1, 10, 1)
    topics = {}
    for i in range(num_topics):
        topic = st.text_input(f"Topic {i+1} Name", key=f"topic_{i}")
        desc = st.text_area(f"Topic {i+1} Description", key=f"description_{i}")
        if topic and desc:
            topics[topic] = desc

    if st.button("Run Analysis"):
        if not topics:
            st.warning("Please enter at least one topic and description.")
            st.stop()

        analyzer = SurveyAnalysis()
        processed_df = analyzer.process_dataframe(st.session_state['df'], topics)
        extracted_df = extract_excerpts(processed_df)

        st.markdown("### Extracted Excerpts")
        st.dataframe(extracted_df)

        csv = extracted_df.to_csv(index=False)
        st.download_button("Download CSV", csv, "extracted_notes.csv", "text/csv")

        if not extracted_df.empty:
            topic_counts = extracted_df['Topic'].value_counts()
            fig, ax = plt.subplots()
            topic_counts.plot.bar(ax=ax, color='#3d9aa1')
            st.pyplot(fig)
        else:
            st.warning("No topics were extracted. Please check the input data and topics.")

if not uploaded_file:
    st.info("Please upload a PDF file to begin.")