snkris's picture
Upload 2 files
e7c8f40 verified
# app.py
import streamlit as st
from PyPDF2 import PdfReader
from transformers import pipeline, AutoTokenizer
from pdf2image import convert_from_bytes
import pytesseract
import torch
import re
# Configuration
ABSTRACT_MODEL = "sshleifer/distilbart-cnn-12-6"
TITLE_MODEL = "linydub/bart-large-samsum"
MAX_FILE_SIZE_MB = 10
TESSERACT_PATH = r'C:\Program Files\Tesseract-OCR\tesseract.exe' # Update this path!
# Set Tesseract path
pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH
@st.cache_resource
def load_models():
"""Load and cache models with proper tokenizers"""
with st.spinner('πŸš€ Loading AI models (first time 2-5 mins)...'):
# Abstract model
abs_tokenizer = AutoTokenizer.from_pretrained(ABSTRACT_MODEL)
abstractive = pipeline(
"summarization",
model=ABSTRACT_MODEL,
tokenizer=abs_tokenizer,
device=0 if torch.cuda.is_available() else -1
)
# Title model
title_tokenizer = AutoTokenizer.from_pretrained(TITLE_MODEL)
title_pipe = pipeline(
"text2text-generation",
model=TITLE_MODEL,
tokenizer=title_tokenizer,
max_length=60
)
return abstractive, title_pipe, abs_tokenizer, title_tokenizer
def extract_text(pdf_file):
"""Handle both text and image-based PDFs"""
try:
# First try regular text extraction
reader = PdfReader(pdf_file)
text = " ".join([page.extract_text() or "" for page in reader.pages])
# Fallback to OCR if no text found
if not text.strip():
images = convert_from_bytes(pdf_file.getvalue())
text = " ".join([pytesseract.image_to_string(img) for img in images])
return clean_text(text)
except Exception as e:
st.error(f"PDF Error: {str(e)}")
return ""
def clean_text(text):
"""Remove headers/footers/section numbers"""
patterns = [
r'\n\s*(\d+)\s*\n', # Page numbers
r'Proceedings of .*?\n', # Conference headers
r'arXiv:\d+\.\d+v\d+.*?\n', # arXiv footers
r'Β©\d{4}.*?\n', # Copyright
r'http\S+', # URLs
r'\b(?:Figure|Table)\s+\d+' # Figure/table captions
]
for pattern in patterns:
text = re.sub(pattern, '', text, flags=re.IGNORECASE)
return text.strip()
def generate_title(abstract, title_pipe):
"""Generate a concise and meaningful research paper title (4-5 words)."""
prompt = f"Generate a short, research-style title (4-5 words) for this abstract: {abstract}"
title = title_pipe(
prompt,
num_beams=5,
early_stopping=True,
max_length=10, # Limit to ~4-5 words
do_sample=False
)[0]['generated_text'].strip()
# Remove unwanted tokens
title = title.replace("<pad>", "").replace("</s>", "").strip()
# Ensure title is concise (4-5 words)
words = title.split()
if len(words) > 5:
title = " ".join(words[:5]) # Keep only the first 5 words
return title
def main():
# Main title
st.markdown("<h1 style='text-align: center;'>RESEARCH PAPER TITLE AND ABSTRACT GENERATION</h1>",
unsafe_allow_html=True)
# Upload section
col1, col2 = st.columns([4, 1])
with col1:
uploaded_file = st.file_uploader("Upload here", type=["pdf"], label_visibility="collapsed")
with col2:
generate_btn = st.button("ENTER", use_container_width=True)
if generate_btn and uploaded_file:
if uploaded_file.size > MAX_FILE_SIZE_MB * 1024 * 1024:
st.error(f"File too large! Max {MAX_FILE_SIZE_MB}MB allowed")
return
raw_text = extract_text(uploaded_file)
if not raw_text.strip():
st.warning("No text extracted - document might be corrupted")
return
abstract_pipe, title_pipe, abs_tokenizer, title_tokenizer = load_models()
with st.status("Processing...", expanded=True) as status:
try:
# Processing steps
st.write("πŸ“– Analyzing document...")
clean_abstract_text = raw_text[:2000] # First 2000 characters
st.write("✍️ Generating abstract...")
abstract = abstract_pipe(
clean_abstract_text,
max_length=150,
min_length=50,
do_sample=False
)[0]['summary_text']
st.write("πŸ–‹οΈ Creating title...")
title = generate_title(abstract, title_pipe)
status.update(label="Complete!", state="complete", expanded=False)
# Display results
st.markdown(f"""
<div style='margin-top: 30px;'>
<p style='font-size: 14px; font-weight: bold;'>TITLE</p>
<p style='font-size: 14px; margin-bottom: 20px;'>{title}</p>
<p style='font-size: 12px; font-weight: bold;'>ABSTRACT</p>
<p style='font-size: 12px;'>{abstract}</p>
</div>
""", unsafe_allow_html=True)
except Exception as e:
st.error(f"Processing failed: {str(e)}")
if __name__ == "__main__":
main()