import re
import textwrap

import fitz  # PyMuPDF
import gradio as gr
import nltk
import spacy
from transformers import pipeline

# Download the NLTK sentence tokenizer data (no-op if already present)
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the spaCy English model, downloading it on first run if it is missing
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Initialize the BigBird-Pegasus summarization pipeline for PubMed texts
summarizer = pipeline("summarization", model="google/bigbird-pegasus-large-pubmed")
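# Note: this PubMed-tuned BigBird-Pegasus checkpoint accepts long inputs (up
# to 4096 tokens), so the 500-word chunks produced below fit comfortably in
# its context window. Passing device=0 to pipeline() would use a GPU if one
# is available (assumption: a CUDA-enabled install).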

# Helper Function: Read PDF with Content Filter
def read_pdf_with_content_filter(file_path, keywords=["Abstract", "Introduction", "Methods", "Results", "Conclusions"]):
    """
    Reads a PDF file and returns text only from pages that contain one of the specified keywords.
    This helps exclude pages that mainly contain header/metadata.
    """
    # Use a context manager so the document is closed once pages are read
    with fitz.open(file_path) as doc:
        content_pages = []
        for page in doc:
            page_text = page.get_text()
            if any(keyword.lower() in page_text.lower() for keyword in keywords):
                content_pages.append(page_text)
    return "\n".join(content_pages)

# Helper Function: Clean Text
def clean_text(text):
    """
    Cleans the text by removing citations, extra whitespace, and unwanted characters.
    """
    text = re.sub(r'\[\d+\]', '', text)  # Remove citations like [12]
    text = re.sub(r'\(\d+\)', '', text)  # Remove citations like (3)
    text = re.sub(r'\s+', ' ', text)     # Normalize whitespace
    return text.strip()
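
# Example (illustrative):
#   clean_text("Prior work [12] shows (3) benefits.")  # -> "Prior work shows benefits."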

# Helper Function: Extract Core Sections
def extract_core_sections(text):
    """
    Attempts to extract core sections using common headings.
    Returns a dictionary with section name (lowercase) as key and its content as value.
    """
    pattern = r'(?i)(Abstract|Introduction|Methods|Results|Conclusions|Discussion)\s*[:\n\.]'
    splits = re.split(pattern, text)
    sections = {}
    if len(splits) > 1:
        for i in range(1, len(splits), 2):
            heading = splits[i].strip().lower()
            content = splits[i+1].strip() if i+1 < len(splits) else ""
            sections[heading] = content
    return sections
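
# Example (illustrative):
#   extract_core_sections("Abstract: We study X. Methods: We did Y.")
#   # -> {"abstract": "We study X.", "methods": "We did Y."}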

# Helper Function: Remove Header Metadata
def remove_header_metadata(text, marker="Competing Interests:"):
    """
    Removes header/metadata from the text by using a marker.
    If the marker is found, returns text after it; otherwise, returns the original text.
    """
    idx = text.find(marker)
    if idx != -1:
        return text[idx + len(marker):].strip()
    return text
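
# Example (illustrative): for text like "...journal header... Competing
# Interests: none declared. Introduction ...", everything up to and including
# the marker is discarded, leaving "none declared. Introduction ...".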

# Helper Function: Split Text into Chunks
def split_into_chunks(text, chunk_size=500):
    """
    Splits the text into chunks of approximately chunk_size words.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunk = " ".join(words[i:i+chunk_size])
        chunks.append(chunk)
    return chunks
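
# Example (illustrative): a 1,200-word text with chunk_size=500 produces
# three chunks of roughly 500, 500, and 200 words.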

# Helper Function: Summarize Text
def summarize_text(text, max_length=200, min_length=50):
    """
    Summarizes the given text using BigBird-Pegasus.
    Adjusts output lengths if the input is very short.
    """
    input_length = len(text.split())
    if input_length < 60:
        max_length = min(max_length, 40)
        min_length = min(min_length, 10)
    summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
    return summary[0]['summary_text']
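
# Note: do_sample=False makes generation deterministic (greedy/beam search),
# so the same input always yields the same summary.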

# Helper Function: Format Bullet Points
def format_bullet_points(summary):
    """
    Splits the summary into sentences and formats each as a bullet point.
    """
    sentences = nltk.sent_tokenize(summary)
    bullets = ["- " + sentence for sentence in sentences]
    return "\n".join(bullets)

# Helper Function: Convert Bullets to Wrapped Paragraph
def bullet_to_paragraph_wrapped(bullet_text, width=80):
    """
    Converts bullet point summary into a paragraph and wraps the text to a specified width.
    """
    # Pegasus-family models emit "<n>" as a newline token; strip it along
    # with the leading bullet markers
    paragraph = bullet_text.replace("- ", "").replace("<n>", " ")
    paragraph = re.sub(r'\s+', ' ', paragraph).strip()
    wrapped_paragraph = textwrap.fill(paragraph, width=width)
    return wrapped_paragraph

# Process PDF Function (Gradio Interface)
def process_pdf(file_obj):
    """
    Processes the uploaded PDF file and returns a bullet summary and a wrapped paragraph summary.
    """
    # Depending on the Gradio version, file_obj may be a tempfile wrapper
    # (with a .name attribute) or a plain file path string
    file_path = file_obj.name if hasattr(file_obj, "name") else file_obj
    full_text = read_pdf_with_content_filter(file_path)
    cleaned_text = clean_text(full_text)
    sections = extract_core_sections(cleaned_text)
    if not sections:
        core_text = remove_header_metadata(cleaned_text)
    else:
        order = ['abstract', 'introduction', 'methods', 'results', 'conclusions', 'discussion']
        core_content = [sections[sec] for sec in order if sec in sections]
        core_text = " ".join(core_content) if core_content else cleaned_text

    chunks = split_into_chunks(core_text, chunk_size=500)
    chunk_summaries = []
    for chunk in chunks:
        try:
            chunk_summary = summarize_text(chunk, max_length=200, min_length=50)
        except Exception:
            # Skip chunks the model fails on rather than aborting the whole run
            chunk_summary = ""
        chunk_summaries.append(chunk_summary)
    final_core_summary_text = " ".join(s for s in chunk_summaries if s).strip()
    if not final_core_summary_text:
        return "No summarizable text could be extracted from this PDF.", ""
    final_summary = summarize_text(final_core_summary_text, max_length=200, min_length=50)
    bullet_points = format_bullet_points(final_summary)
    paragraph_summary_wrapped = bullet_to_paragraph_wrapped(bullet_points, width=80)
    return bullet_points, paragraph_summary_wrapped

# Create Gradio Interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload a Medical PDF"),
    outputs=[
        gr.Textbox(label="Bullet Summary"),
        gr.Textbox(label="Paragraph Summary")
    ],
    title="Medical Document Summarization",
    description="Upload a medical PDF document to get a summarized bullet-point and paragraph summary of its core content."
)
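
# Note: launch() serves the app locally; passing share=True would also create
# a temporary public URL (optional).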

if __name__ == "__main__":
    iface.launch()