import re
import textwrap

import fitz  # PyMuPDF
import gradio as gr
import nltk
import spacy
from transformers import pipeline

# Download the NLTK sentence tokenizer data (no-op if already present)
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the spaCy English model, downloading it on first run if it is missing
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Initialize the BigBird-Pegasus summarization pipeline for PubMed texts
summarizer = pipeline("summarization", model="google/bigbird-pegasus-large-pubmed")
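# Note: this PubMed-tuned BigBird-Pegasus checkpoint accepts long inputs (up
# to 4096 tokens), so the 500-word chunks produced below fit comfortably in
# its context window. Passing device=0 to pipeline() would use a GPU if one
# is available (assumption: a CUDA-enabled install).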

# Helper Function: Read PDF with Content Filter
def read_pdf_with_content_filter(file_path, keywords=["Abstract", "Introduction", "Methods", "Results", "Conclusions"]):
    """
    Reads a PDF file and returns text only from pages that contain one of the specified keywords.
    This helps exclude pages that mainly contain header/metadata.
    """
    # Use a context manager so the document is closed once pages are read
    with fitz.open(file_path) as doc:
        content_pages = []
        for page in doc:
            page_text = page.get_text()
            if any(keyword.lower() in page_text.lower() for keyword in keywords):
                content_pages.append(page_text)
    return "\n".join(content_pages)

# Helper Function: Clean Text
def clean_text(text):
    """
    Cleans the text by removing citations, extra whitespace, and unwanted characters.
    """
    text = re.sub(r'\[\d+\]', '', text)  # Remove citations like [12]
    text = re.sub(r'\(\d+\)', '', text)  # Remove citations like (3)
    text = re.sub(r'\s+', ' ', text)     # Normalize whitespace
    return text.strip()
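
# Example (illustrative):
#   clean_text("Prior work [12] shows (3) benefits.")  # -> "Prior work shows benefits."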

# Helper Function: Extract Core Sections
def extract_core_sections(text):
    """
    Attempts to extract core sections using common headings.
    Returns a dictionary with section name (lowercase) as key and its content as value.
    """
    pattern = r'(?i)(Abstract|Introduction|Methods|Results|Conclusions|Discussion)\s*[:\n\.]'
    splits = re.split(pattern, text)
    sections = {}
    if len(splits) > 1:
        for i in range(1, len(splits), 2):
            heading = splits[i].strip().lower()
            content = splits[i+1].strip() if i+1 < len(splits) else ""
            sections[heading] = content
    return sections
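
# Example (illustrative):
#   extract_core_sections("Abstract: We study X. Methods: We did Y.")
#   # -> {"abstract": "We study X.", "methods": "We did Y."}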

# Helper Function: Remove Header Metadata
def remove_header_metadata(text, marker="Competing Interests:"):
    """
    Removes header/metadata from the text by using a marker.
    If the marker is found, returns text after it; otherwise, returns the original text.
    """
    idx = text.find(marker)
    if idx != -1:
        return text[idx + len(marker):].strip()
    return text
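
# Example (illustrative): for text like "...journal header... Competing
# Interests: none declared. Introduction ...", everything up to and including
# the marker is discarded, leaving "none declared. Introduction ...".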

# Helper Function: Split Text into Chunks
def split_into_chunks(text, chunk_size=500):
    """
    Splits the text into chunks of approximately chunk_size words.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunk = " ".join(words[i:i+chunk_size])
        chunks.append(chunk)
    return chunks
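
# Example (illustrative): a 1,200-word text with chunk_size=500 produces
# three chunks of roughly 500, 500, and 200 words.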

# Helper Function: Summarize Text
def summarize_text(text, max_length=200, min_length=50):
    """
    Summarizes the given text using BigBird-Pegasus.
    Adjusts output lengths if the input is very short.
    """
    input_length = len(text.split())
    if input_length < 60:
        max_length = min(max_length, 40)
        min_length = min(min_length, 10)
    summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
    return summary[0]['summary_text']
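
# Note: do_sample=False makes generation deterministic (greedy/beam search),
# so the same input always yields the same summary.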

# Helper Function: Format Bullet Points
def format_bullet_points(summary):
    """
    Splits the summary into sentences and formats each as a bullet point.
    """
    sentences = nltk.sent_tokenize(summary)
    bullets = ["- " + sentence for sentence in sentences]
    return "\n".join(bullets)

# Helper Function: Convert Bullets to Wrapped Paragraph
def bullet_to_paragraph_wrapped(bullet_text, width=80):
    """
    Converts bullet point summary into a paragraph and wraps the text to a specified width.
    """
    # Pegasus-family models emit "<n>" as a newline token; strip it along
    # with the leading bullet markers
    paragraph = bullet_text.replace("- ", "").replace("<n>", " ")
    paragraph = re.sub(r'\s+', ' ', paragraph).strip()
    wrapped_paragraph = textwrap.fill(paragraph, width=width)
    return wrapped_paragraph

# Process PDF Function (Gradio Interface)
def process_pdf(file_obj):
    """
    Processes the uploaded PDF file and returns a bullet summary and a wrapped paragraph summary.
    """
    # Depending on the Gradio version, file_obj may be a tempfile wrapper
    # (with a .name attribute) or a plain file path string
    file_path = file_obj.name if hasattr(file_obj, "name") else file_obj
    full_text = read_pdf_with_content_filter(file_path)
    cleaned_text = clean_text(full_text)
    sections = extract_core_sections(cleaned_text)
    if not sections:
        core_text = remove_header_metadata(cleaned_text)
    else:
        order = ['abstract', 'introduction', 'methods', 'results', 'conclusions', 'discussion']
        core_content = [sections[sec] for sec in order if sec in sections]
        core_text = " ".join(core_content) if core_content else cleaned_text

    chunks = split_into_chunks(core_text, chunk_size=500)
    chunk_summaries = []
    for chunk in chunks:
        try:
            chunk_summary = summarize_text(chunk, max_length=200, min_length=50)
        except Exception:
            # Skip chunks the model fails on rather than aborting the whole run
            chunk_summary = ""
        chunk_summaries.append(chunk_summary)
    final_core_summary_text = " ".join(s for s in chunk_summaries if s).strip()
    if not final_core_summary_text:
        return "No summarizable text could be extracted from this PDF.", ""
    final_summary = summarize_text(final_core_summary_text, max_length=200, min_length=50)
    bullet_points = format_bullet_points(final_summary)
    paragraph_summary_wrapped = bullet_to_paragraph_wrapped(bullet_points, width=80)
    return bullet_points, paragraph_summary_wrapped

# Create Gradio Interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload a Medical PDF"),
    outputs=[
        gr.Textbox(label="Bullet Summary"),
        gr.Textbox(label="Paragraph Summary")
    ],
    title="Medical Document Summarization",
    description="Upload a medical PDF document to get a summarized bullet-point and paragraph summary of its core content."
)
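
# Note: launch() serves the app locally; passing share=True would also create
# a temporary public URL (optional).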

if __name__ == "__main__":
    iface.launch()