# app.py import streamlit as st from PyPDF2 import PdfReader from transformers import pipeline, AutoTokenizer from pdf2image import convert_from_bytes import pytesseract import torch import re # Configuration ABSTRACT_MODEL = "sshleifer/distilbart-cnn-12-6" TITLE_MODEL = "linydub/bart-large-samsum" MAX_FILE_SIZE_MB = 10 TESSERACT_PATH = r'C:\Program Files\Tesseract-OCR\tesseract.exe' # Update this path! # Set Tesseract path pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH @st.cache_resource def load_models(): """Load and cache models with proper tokenizers""" with st.spinner('🚀 Loading AI models (first time 2-5 mins)...'): # Abstract model abs_tokenizer = AutoTokenizer.from_pretrained(ABSTRACT_MODEL) abstractive = pipeline( "summarization", model=ABSTRACT_MODEL, tokenizer=abs_tokenizer, device=0 if torch.cuda.is_available() else -1 ) # Title model title_tokenizer = AutoTokenizer.from_pretrained(TITLE_MODEL) title_pipe = pipeline( "text2text-generation", model=TITLE_MODEL, tokenizer=title_tokenizer, max_length=60 ) return abstractive, title_pipe, abs_tokenizer, title_tokenizer def extract_text(pdf_file): """Handle both text and image-based PDFs""" try: # First try regular text extraction reader = PdfReader(pdf_file) text = " ".join([page.extract_text() or "" for page in reader.pages]) # Fallback to OCR if no text found if not text.strip(): images = convert_from_bytes(pdf_file.getvalue()) text = " ".join([pytesseract.image_to_string(img) for img in images]) return clean_text(text) except Exception as e: st.error(f"PDF Error: {str(e)}") return "" def clean_text(text): """Remove headers/footers/section numbers""" patterns = [ r'\n\s*(\d+)\s*\n', # Page numbers r'Proceedings of .*?\n', # Conference headers r'arXiv:\d+\.\d+v\d+.*?\n', # arXiv footers r'©\d{4}.*?\n', # Copyright r'http\S+', # URLs r'\b(?:Figure|Table)\s+\d+' # Figure/table captions ] for pattern in patterns: text = re.sub(pattern, '', text, flags=re.IGNORECASE) return text.strip() def generate_title(abstract, title_pipe): """Generate a concise and meaningful research paper title (4-5 words).""" prompt = f"Generate a short, research-style title (4-5 words) for this abstract: {abstract}" title = title_pipe( prompt, num_beams=5, early_stopping=True, max_length=10, # Limit to ~4-5 words do_sample=False )[0]['generated_text'].strip() # Remove unwanted tokens title = title.replace("", "").replace("", "").strip() # Ensure title is concise (4-5 words) words = title.split() if len(words) > 5: title = " ".join(words[:5]) # Keep only the first 5 words return title def main(): # Main title st.markdown("

RESEARCH PAPER TITLE AND ABSTRACT GENERATION

", unsafe_allow_html=True) # Upload section col1, col2 = st.columns([4, 1]) with col1: uploaded_file = st.file_uploader("Upload here", type=["pdf"], label_visibility="collapsed") with col2: generate_btn = st.button("ENTER", use_container_width=True) if generate_btn and uploaded_file: if uploaded_file.size > MAX_FILE_SIZE_MB * 1024 * 1024: st.error(f"File too large! Max {MAX_FILE_SIZE_MB}MB allowed") return raw_text = extract_text(uploaded_file) if not raw_text.strip(): st.warning("No text extracted - document might be corrupted") return abstract_pipe, title_pipe, abs_tokenizer, title_tokenizer = load_models() with st.status("Processing...", expanded=True) as status: try: # Processing steps st.write("📖 Analyzing document...") clean_abstract_text = raw_text[:2000] # First 2000 characters st.write("✍️ Generating abstract...") abstract = abstract_pipe( clean_abstract_text, max_length=150, min_length=50, do_sample=False )[0]['summary_text'] st.write("🖋️ Creating title...") title = generate_title(abstract, title_pipe) status.update(label="Complete!", state="complete", expanded=False) # Display results st.markdown(f"""

TITLE

{title}

ABSTRACT

{abstract}

""", unsafe_allow_html=True) except Exception as e: st.error(f"Processing failed: {str(e)}") if __name__ == "__main__": main()