Spaces:

snkris
/

Research-Paper-Summarizer

Sleeping

App Files Files Community

snkris committed on Mar 24

Commit

e7c8f40

verified ·

1 Parent(s): 489075a

Upload 2 files

Browse files

Files changed (2) hide show

app.py +155 -0
requirements.txt +10 -0

app.py ADDED Viewed

	@@ -0,0 +1,155 @@

+# app.py
+# Standard library
+import html
+import os
+import re
+import shutil
+# Third-party
+import pytesseract
+import streamlit as st
+import torch
+from pdf2image import convert_from_bytes
+from PyPDF2 import PdfReader
+from transformers import pipeline, AutoTokenizer
+# Configuration
+ABSTRACT_MODEL = "sshleifer/distilbart-cnn-12-6"
+TITLE_MODEL = "linydub/bart-large-samsum"
+MAX_FILE_SIZE_MB = 10
+TESSERACT_PATH = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Update this path!
+# Set Tesseract path
+pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH
+@st.cache_resource
+def load_models():
+    """Load and cache models with proper tokenizers"""
+    with st.spinner('🚀 Loading AI models (first time 2-5 mins)...'):
+        # Abstract model
+        abs_tokenizer = AutoTokenizer.from_pretrained(ABSTRACT_MODEL)
+        abstractive = pipeline(
+            "summarization",
+            model=ABSTRACT_MODEL,
+            tokenizer=abs_tokenizer,
+            device=0 if torch.cuda.is_available() else -1
+        )
+        # Title model
+        title_tokenizer = AutoTokenizer.from_pretrained(TITLE_MODEL)
+        title_pipe = pipeline(
+            "text2text-generation",
+            model=TITLE_MODEL,
+            tokenizer=title_tokenizer,
+            max_length=60
+        )
+    return abstractive, title_pipe, abs_tokenizer, title_tokenizer
+def extract_text(pdf_file):
+    """Handle both text and image-based PDFs"""
+    try:
+        # First try regular text extraction
+        reader = PdfReader(pdf_file)
+        text = " ".join([page.extract_text() or "" for page in reader.pages])
+        # Fallback to OCR if no text found
+        if not text.strip():
+            images = convert_from_bytes(pdf_file.getvalue())
+            text = " ".join([pytesseract.image_to_string(img) for img in images])
+        return clean_text(text)
+    except Exception as e:
+        st.error(f"PDF Error: {str(e)}")
+        return ""
+def clean_text(text):
+    """Remove headers/footers/section numbers"""
+    patterns = [
+        r'\n\s*(\d+)\s*\n',          # Page numbers
+        r'Proceedings of .*?\n',      # Conference headers
+        r'arXiv:\d+\.\d+v\d+.*?\n',   # arXiv footers
+        r'©\d{4}.*?\n',               # Copyright
+        r'http\S+',                   # URLs
+        r'\b(?:Figure|Table)\s+\d+'   # Figure/table captions
+    ]
+    for pattern in patterns:
+        text = re.sub(pattern, '', text, flags=re.IGNORECASE)
+    return text.strip()
+def generate_title(abstract, title_pipe):
+    """Generate a concise and meaningful research paper title (4-5 words)."""
+    prompt = f"Generate a short, research-style title (4-5 words) for this abstract: {abstract}"
+    title = title_pipe(
+        prompt,
+        num_beams=5,
+        early_stopping=True,
+        max_length=10,  # Limit to ~4-5 words
+        do_sample=False
+    )[0]['generated_text'].strip()
+    # Remove unwanted tokens
+    title = title.replace("<pad>", "").replace("</s>", "").strip()
+    # Ensure title is concise (4-5 words)
+    words = title.split()
+    if len(words) > 5:
+        title = " ".join(words[:5])  # Keep only the first 5 words
+    return title
+def main():
+    # Main title
+    st.markdown("<h1 style='text-align: center;'>RESEARCH PAPER TITLE AND ABSTRACT GENERATION</h1>",
+                unsafe_allow_html=True)
+    # Upload section
+    col1, col2 = st.columns([4, 1])
+    with col1:
+        uploaded_file = st.file_uploader("Upload here", type=["pdf"], label_visibility="collapsed")
+    with col2:
+        generate_btn = st.button("ENTER", use_container_width=True)
+    if generate_btn and uploaded_file:
+        if uploaded_file.size > MAX_FILE_SIZE_MB * 1024 * 1024:
+            st.error(f"File too large! Max {MAX_FILE_SIZE_MB}MB allowed")
+            return
+        raw_text = extract_text(uploaded_file)
+        if not raw_text.strip():
+            st.warning("No text extracted - document might be corrupted")
+            return
+        abstract_pipe, title_pipe, abs_tokenizer, title_tokenizer = load_models()
+        with st.status("Processing...", expanded=True) as status:
+            try:
+                # Processing steps
+                st.write("📖 Analyzing document...")
+                clean_abstract_text = raw_text[:2000]  # First 2000 characters
+                st.write("✍️ Generating abstract...")
+                abstract = abstract_pipe(
+                    clean_abstract_text,
+                    max_length=150,
+                    min_length=50,
+                    do_sample=False
+                )[0]['summary_text']
+                st.write("🖋️ Creating title...")
+                title = generate_title(abstract, title_pipe)
+                status.update(label="Complete!", state="complete", expanded=False)
+                # Display results
+                st.markdown(f"""
+                <div style='margin-top: 30px;'>
+                    <p style='font-size: 14px; font-weight: bold;'>TITLE</p>
+                    <p style='font-size: 14px; margin-bottom: 20px;'>{title}</p>
+                    <p style='font-size: 12px; font-weight: bold;'>ABSTRACT</p>
+                    <p style='font-size: 12px;'>{abstract}</p>
+                </div>
+                """, unsafe_allow_html=True)
+            except Exception as e:
+                st.error(f"Processing failed: {str(e)}")
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+# Python dependencies for app.py; every package is pinned to an exact version.
+# The extra index lets pip find the "+cpu" build of torch requested below.
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.3.0+cpu
+streamlit==1.30.0
+PyPDF2==3.0.1
+transformers==4.38.2
+sentencepiece==0.2.0
+# NOTE(review): pdf2image presumably needs the Poppler utilities installed on the host - confirm.
+pdf2image==1.17.0
+# pytesseract drives an external Tesseract executable; app.py sets its path via TESSERACT_PATH.
+pytesseract==0.3.10
+pillow==10.3.0
+# NOTE(review): python-dotenv is not imported by app.py as shown - confirm it is still needed.
+python-dotenv==1.0.1