import re
import textwrap

import fitz  # PyMuPDF
import gradio as gr
import nltk
import spacy
from transformers import pipeline

# Download the NLTK sentence-tokenizer data if not already present
nltk.download('punkt')
nltk.download('punkt_tab')
# Load the spaCy model, downloading it on first run if it is missing.
# (Loaded up front; not used directly by the summarization pipeline below.)
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
# Initialize the BigBird-Pegasus summarization pipeline for PubMed texts
summarizer = pipeline("summarization", model="google/bigbird-pegasus-large-pubmed")
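
# Illustrative smoke test (a sketch, not part of the app flow; assumes the
# model weights have downloaded and uses a hypothetical placeholder abstract):
#
#   sample = ("background: we evaluated treatment X in a randomized trial. "
#             "results: the treatment arm showed improved outcomes.")
#   print(summarizer(sample, max_length=60, min_length=20)[0]['summary_text'])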
# Helper Function: Read PDF with Content Filter
def read_pdf_with_content_filter(file_path, keywords=("Abstract", "Introduction", "Methods", "Results", "Conclusions")):
    """
    Reads a PDF file and returns text only from pages that contain one of the
    specified keywords. This helps exclude pages that mainly contain
    headers/metadata.
    """
    doc = fitz.open(file_path)
    content_pages = []
    for page in doc:
        page_text = page.get_text()
        if any(keyword.lower() in page_text.lower() for keyword in keywords):
            content_pages.append(page_text)
    return "\n".join(content_pages)
# Helper Function: Clean Text
def clean_text(text):
    """
    Cleans the text by removing citations, extra whitespace, and unwanted characters.
    """
    text = re.sub(r'\[\d+\]', '', text)  # Remove citations like [12]
    text = re.sub(r'\(\d+\)', '', text)  # Remove citations like (3)
    text = re.sub(r'\s+', ' ', text)     # Normalize whitespace
    return text.strip()
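
# Example (note the stray space left where a citation preceded punctuation):
#   clean_text("Prior work [12] reported   gains (3).")
#   -> 'Prior work reported gains .'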
# Helper Function: Extract Core Sections
def extract_core_sections(text):
    """
    Attempts to extract core sections using common headings.
    Returns a dictionary with the section name (lowercase) as key and its content as value.
    """
    pattern = r'(?i)(Abstract|Introduction|Methods|Results|Conclusions|Discussion)\s*[:\n.]'
    # re.split with a capture group alternates between the text before a
    # heading, the heading itself, and the text that follows it.
    splits = re.split(pattern, text)
    sections = {}
    if len(splits) > 1:
        for i in range(1, len(splits), 2):
            heading = splits[i].strip().lower()
            content = splits[i + 1].strip() if i + 1 < len(splits) else ""
            sections[heading] = content
    return sections
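
# Example:
#   extract_core_sections("Abstract: We study X. Methods: We did Y.")
#   -> {'abstract': 'We study X.', 'methods': 'We did Y.'}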
# Helper Function: Remove Header Metadata
def remove_header_metadata(text, marker="Competing Interests:"):
    """
    Removes header/metadata from the text using a marker.
    If the marker is found, returns the text after it; otherwise, returns the original text.
    """
    idx = text.find(marker)
    if idx != -1:
        return text[idx + len(marker):].strip()
    return text
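
# Example:
#   remove_header_metadata("Front matter. Competing Interests: none declared.")
#   -> 'none declared.'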
# Helper Function: Split Text into Chunks
def split_into_chunks(text, chunk_size=500):
    """
    Splits the text into chunks of approximately chunk_size words.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunks.append(" ".join(words[i:i + chunk_size]))
    return chunks
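
# Example:
#   split_into_chunks("one two three four five", chunk_size=2)
#   -> ['one two', 'three four', 'five']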
# Helper Function: Summarize Text
def summarize_text(text, max_length=200, min_length=50):
    """
    Summarizes the given text using BigBird-Pegasus.
    Shrinks the output length targets if the input is very short.
    """
    input_length = len(text.split())
    if input_length < 60:
        max_length = min(max_length, 40)
        min_length = min(min_length, 10)
    summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
    return summary[0]['summary_text']
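
# Example (output text is model-dependent):
#   summary = summarize_text(core_text, max_length=150, min_length=40)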
# Helper Function: Format Bullet Points
def format_bullet_points(summary):
    """
    Splits the summary into sentences and formats each as a bullet point.
    """
    sentences = nltk.sent_tokenize(summary)
    bullets = ["- " + sentence for sentence in sentences]
    return "\n".join(bullets)
# Helper Function: Convert Bullets to Wrapped Paragraph
def bullet_to_paragraph_wrapped(bullet_text, width=80):
    """
    Converts a bullet-point summary into a paragraph and wraps the text to the
    specified width.
    """
    # Pegasus-style models emit "<n>" as a newline token; strip it along with
    # the bullet markers.
    paragraph = bullet_text.replace("- ", "").replace("<n>", " ")
    paragraph = re.sub(r'\s+', ' ', paragraph).strip()
    return textwrap.fill(paragraph, width=width)
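
# Example:
#   bullet_to_paragraph_wrapped("- First point.\n- Second point.", width=40)
#   -> 'First point. Second point.'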
# Process PDF Function (Gradio Interface)
def process_pdf(file_obj):
    """
    Processes the uploaded PDF file and returns a bullet summary and a wrapped
    paragraph summary.
    """
    # Depending on the Gradio version, the file input arrives either as a path
    # string or as a temp-file object with a .name attribute; handle both.
    file_path = file_obj if isinstance(file_obj, str) else file_obj.name
    full_text = read_pdf_with_content_filter(file_path)
    cleaned_text = clean_text(full_text)
    sections = extract_core_sections(cleaned_text)
    if not sections:
        core_text = remove_header_metadata(cleaned_text)
    else:
        order = ['abstract', 'introduction', 'methods', 'results', 'conclusions', 'discussion']
        core_content = [sections[sec] for sec in order if sec in sections]
        core_text = " ".join(core_content) if core_content else cleaned_text
    # Summarize chunk by chunk, then summarize the concatenated chunk summaries.
    chunks = split_into_chunks(core_text, chunk_size=500)
    chunk_summaries = []
    for chunk in chunks:
        try:
            chunk_summaries.append(summarize_text(chunk, max_length=200, min_length=50))
        except Exception:
            chunk_summaries.append("")  # Skip chunks the model cannot handle
    final_core_summary_text = " ".join(chunk_summaries)
    final_summary = summarize_text(final_core_summary_text, max_length=200, min_length=50)
    bullet_points = format_bullet_points(final_summary)
    paragraph_summary_wrapped = bullet_to_paragraph_wrapped(bullet_points, width=80)
    return bullet_points, paragraph_summary_wrapped
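
# Example (hypothetical local file; bypasses the Gradio UI):
#   bullets, paragraph = process_pdf("paper.pdf")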
# Create Gradio Interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload a Medical PDF"),
    outputs=[
        gr.Textbox(label="Bullet Summary"),
        gr.Textbox(label="Paragraph Summary"),
    ],
    title="Medical Document Summarization",
    description="Upload a medical PDF to get bullet-point and paragraph summaries of its core content.",
)
if __name__ == "__main__":
    iface.launch()