# app.py
import streamlit as st
from PyPDF2 import PdfReader
from transformers import pipeline, AutoTokenizer
from pdf2image import convert_from_bytes
import pytesseract
import torch
import re

# Configuration
ABSTRACT_MODEL = "sshleifer/distilbart-cnn-12-6"
TITLE_MODEL = "linydub/bart-large-samsum"
MAX_FILE_SIZE_MB = 10
TESSERACT_PATH = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Update this path!

# Set Tesseract path
pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH
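# Note: the Windows path above is just an example for local runs; on a Linux host
# (e.g. a Hugging Face Space with the tesseract-ocr system package installed), the
# binary is typically found at /usr/bin/tesseract, so adjust TESSERACT_PATH there.
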
@st.cache_resource
def load_models():
    """Load and cache models with proper tokenizers"""
    with st.spinner('Loading AI models (first time 2-5 mins)...'):
        # Abstract model
        abs_tokenizer = AutoTokenizer.from_pretrained(ABSTRACT_MODEL)
        abstractive = pipeline(
            "summarization",
            model=ABSTRACT_MODEL,
            tokenizer=abs_tokenizer,
            device=0 if torch.cuda.is_available() else -1
        )

        # Title model
        title_tokenizer = AutoTokenizer.from_pretrained(TITLE_MODEL)
        title_pipe = pipeline(
            "text2text-generation",
            model=TITLE_MODEL,
            tokenizer=title_tokenizer,
            max_length=60
        )
    return abstractive, title_pipe, abs_tokenizer, title_tokenizer

def extract_text(pdf_file):
    """Handle both text and image-based PDFs"""
    try:
        # First try regular text extraction
        reader = PdfReader(pdf_file)
        text = " ".join([page.extract_text() or "" for page in reader.pages])

        # Fallback to OCR if no text found
        if not text.strip():
            images = convert_from_bytes(pdf_file.getvalue())
            text = " ".join([pytesseract.image_to_string(img) for img in images])

        return clean_text(text)
    except Exception as e:
        st.error(f"PDF Error: {str(e)}")
        return ""

def clean_text(text):
    """Remove headers/footers/section numbers"""
    patterns = [
        r'\n\s*(\d+)\s*\n',          # Page numbers
        r'Proceedings of .*?\n',     # Conference headers
        r'arXiv:\d+\.\d+v\d+.*?\n',  # arXiv footers
        r'©\d{4}.*?\n',              # Copyright notices
        r'http\S+',                  # URLs
        r'\b(?:Figure|Table)\s+\d+'  # Figure/table captions
    ]
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    return text.strip()

def generate_title(abstract, title_pipe):
    """Generate a concise and meaningful research paper title (4-5 words)."""
    prompt = f"Generate a short, research-style title (4-5 words) for this abstract: {abstract}"
    title = title_pipe(
        prompt,
        num_beams=5,
        early_stopping=True,
        max_length=10,  # max_length counts tokens, which roughly caps output at 4-5 words
        do_sample=False
    )[0]['generated_text'].strip()

    # Remove unwanted tokens
    title = title.replace("<pad>", "").replace("</s>", "").strip()

    # Ensure title is concise (4-5 words)
    words = title.split()
    if len(words) > 5:
        title = " ".join(words[:5])  # Keep only the first 5 words
    return title

def main():
    # Main title
    st.markdown("<h1 style='text-align: center;'>RESEARCH PAPER TITLE AND ABSTRACT GENERATION</h1>",
                unsafe_allow_html=True)

    # Upload section
    col1, col2 = st.columns([4, 1])
    with col1:
        uploaded_file = st.file_uploader("Upload here", type=["pdf"], label_visibility="collapsed")
    with col2:
        generate_btn = st.button("ENTER", use_container_width=True)

    if generate_btn and uploaded_file:
        if uploaded_file.size > MAX_FILE_SIZE_MB * 1024 * 1024:
            st.error(f"File too large! Max {MAX_FILE_SIZE_MB}MB allowed")
            return

        raw_text = extract_text(uploaded_file)
        if not raw_text.strip():
            st.warning("No text extracted - document might be corrupted")
            return

        abstract_pipe, title_pipe, abs_tokenizer, title_tokenizer = load_models()

        with st.status("Processing...", expanded=True) as status:
            try:
                # Processing steps
                st.write("Analyzing document...")
                clean_abstract_text = raw_text[:2000]  # First 2000 characters

                st.write("Generating abstract...")
                abstract = abstract_pipe(
                    clean_abstract_text,
                    max_length=150,
                    min_length=50,
                    do_sample=False
                )[0]['summary_text']

                st.write("Creating title...")
                title = generate_title(abstract, title_pipe)

                status.update(label="Complete!", state="complete", expanded=False)

                # Display results
                st.markdown(f"""
                <div style='margin-top: 30px;'>
                    <p style='font-size: 14px; font-weight: bold;'>TITLE</p>
                    <p style='font-size: 14px; margin-bottom: 20px;'>{title}</p>
                    <p style='font-size: 12px; font-weight: bold;'>ABSTRACT</p>
                    <p style='font-size: 12px;'>{abstract}</p>
                </div>
                """, unsafe_allow_html=True)
            except Exception as e:
                st.error(f"Processing failed: {str(e)}")

if __name__ == "__main__":
    main()
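
# Usage sketch (an assumption, not spelled out in the file): the app can be run
# locally with `streamlit run app.py` after installing the imported packages
# (streamlit, PyPDF2, transformers, torch, pdf2image, pytesseract). pdf2image
# additionally requires the Poppler utilities, and pytesseract requires the
# Tesseract OCR engine that TESSERACT_PATH points to.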