File size: 5,377 Bytes
e7c8f40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# app.py
import html
import os
import re

import pytesseract
import streamlit as st
import torch
from pdf2image import convert_from_bytes
from PyPDF2 import PdfReader
from transformers import pipeline, AutoTokenizer

# Configuration
ABSTRACT_MODEL = "sshleifer/distilbart-cnn-12-6"   # summarization checkpoint
TITLE_MODEL = "linydub/bart-large-samsum"          # checkpoint used for titles
MAX_FILE_SIZE_MB = 10                              # upload limit enforced in main()

# Location of the Tesseract binary. Defaults to the standard Windows install
# path; set the TESSERACT_PATH environment variable to point elsewhere.
TESSERACT_PATH = os.environ.get(
    "TESSERACT_PATH",
    r'C:\Program Files\Tesseract-OCR\tesseract.exe',
)

# Only override pytesseract's own default command when the configured binary
# actually exists. Assigning a missing Windows path unconditionally would
# break the OCR fallback on machines where Tesseract is simply on PATH.
if os.path.isfile(TESSERACT_PATH):
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH

@st.cache_resource
def load_models():
    """Load the summarization and title pipelines together with their tokenizers.

    The function is wrapped in ``st.cache_resource``, so the models are built
    once and reused on later calls. Both pipelines are placed on the first
    CUDA device when one is available and on the CPU otherwise.

    Returns:
        A 4-tuple ``(abstractive, title_pipe, abs_tokenizer, title_tokenizer)``:
        the summarization pipeline, the title-generation pipeline, and the
        tokenizer loaded for each of them.
    """
    # Resolve the device once so both pipelines run on the same hardware;
    # previously only the summarizer was moved to the GPU.
    device = 0 if torch.cuda.is_available() else -1

    # "\U0001F680" is the rocket emoji, written as an escape so the message
    # survives any re-encoding of this source file.
    with st.spinner('\U0001F680 Loading AI models (first time 2-5 mins)...'):
        # Abstract model
        abs_tokenizer = AutoTokenizer.from_pretrained(ABSTRACT_MODEL)
        abstractive = pipeline(
            "summarization",
            model=ABSTRACT_MODEL,
            tokenizer=abs_tokenizer,
            device=device,
        )

        # Title model
        title_tokenizer = AutoTokenizer.from_pretrained(TITLE_MODEL)
        title_pipe = pipeline(
            "text2text-generation",
            model=TITLE_MODEL,
            tokenizer=title_tokenizer,
            max_length=60,
            device=device,
        )

    return abstractive, title_pipe, abs_tokenizer, title_tokenizer

def extract_text(pdf_file):
    """Return cleaned text from a PDF, using OCR when it has no text layer.

    Args:
        pdf_file: The uploaded PDF. It is handed to ``PdfReader`` and, for the
            OCR fallback, its raw bytes are read through ``getvalue()``.

    Returns:
        The extracted text after ``clean_text``; an empty string if any step
        raises (the error is reported through ``st.error``).
    """
    try:
        # Pass 1: read the embedded text layer page by page.
        page_texts = []
        for page in PdfReader(pdf_file).pages:
            page_texts.append(page.extract_text() or "")
        embedded = " ".join(page_texts)

        if embedded.strip():
            return clean_text(embedded)

        # Pass 2: nothing but whitespace came back, so rasterize the pages
        # and run OCR on each image instead.
        page_images = convert_from_bytes(pdf_file.getvalue())
        recognized = " ".join(
            pytesseract.image_to_string(image) for image in page_images
        )
        return clean_text(recognized)
    except Exception as e:
        st.error(f"PDF Error: {str(e)}")
        return ""

def clean_text(text):
    """Strip common boilerplate from text extracted from a paper.

    The following are removed, case-insensitively and in this order:
    standalone page-number lines, "Proceedings of ..." lines, arXiv
    identifier lines, copyright lines, URLs, and "Figure N" / "Table N"
    references.

    Args:
        text: Raw text extracted from the PDF.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    patterns = [
        # Page numbers on a line of their own. The lookahead leaves the
        # trailing newline in place so the lines before and after the number
        # are not glued into a single word.
        r'\n\s*\d+\s*(?=\n)',
        r'Proceedings of .*?\n',       # Conference headers
        r'arXiv:\d+\.\d+v\d+.*?\n',    # arXiv footers
        # Copyright lines. \u00a9 is the copyright sign, written as an escape
        # so the pattern cannot be corrupted by a source re-encoding.
        r'\u00a9\s*\d{4}.*?\n',
        r'http\S+',                    # URLs
        r'\b(?:Figure|Table)\s+\d+',   # Figure/table references
    ]

    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)

    return text.strip()

def generate_title(abstract, title_pipe):
    """Produce a short title (at most five words) for the given abstract.

    Args:
        abstract: The abstract text to build the title from.
        title_pipe: Callable invoked with the prompt and generation keyword
            arguments; expected to return a list whose first item is a dict
            with a ``'generated_text'`` entry.

    Returns:
        The generated title with special tokens removed, cut to its first
        five whitespace-separated words when it is longer than that.
    """
    prompt = f"Generate a short, research-style title (4-5 words) for this abstract: {abstract}"

    generation_options = {
        "num_beams": 5,
        "early_stopping": True,
        "max_length": 10,  # token budget, roughly 4-5 words
        "do_sample": False,
    }
    outputs = title_pipe(prompt, **generation_options)
    candidate = outputs[0]['generated_text'].strip()

    # Drop special tokens that may have leaked into the decoded text.
    for special_token in ("<pad>", "</s>"):
        candidate = candidate.replace(special_token, "")
    candidate = candidate.strip()

    # Short enough already: hand it back untouched.
    pieces = candidate.split()
    if len(pieces) <= 5:
        return candidate

    # Otherwise keep only the leading five words.
    return " ".join(pieces[:5])

def main():
    """Render the page and run the upload -> abstract -> title workflow.

    Nothing is processed until a PDF has been uploaded and the button has
    been pressed. The file is then size-checked, its text is extracted, an
    abstract is summarized from the first 2000 characters, and a title is
    generated from that abstract. Progress is reported in a status container
    and the results are rendered below it.
    """
    # Main title
    st.markdown("<h1 style='text-align: center;'>RESEARCH PAPER TITLE AND ABSTRACT GENERATION</h1>",
                unsafe_allow_html=True)

    # Upload section
    col1, col2 = st.columns([4, 1])
    with col1:
        uploaded_file = st.file_uploader("Upload here", type=["pdf"], label_visibility="collapsed")
    with col2:
        generate_btn = st.button("ENTER", use_container_width=True)

    if not (generate_btn and uploaded_file):
        return

    if uploaded_file.size > MAX_FILE_SIZE_MB * 1024 * 1024:
        st.error(f"File too large! Max {MAX_FILE_SIZE_MB}MB allowed")
        return

    raw_text = extract_text(uploaded_file)
    if not raw_text.strip():
        st.warning("No text extracted - document might be corrupted")
        return

    # load_models() also returns the two tokenizers; they are not needed here.
    abstract_pipe, title_pipe, _, _ = load_models()

    with st.status("Processing...", expanded=True) as status:
        try:
            # Emoji are written as escapes so the messages survive any
            # re-encoding of this source file.
            st.write("\U0001F4D6 Analyzing document...")
            clean_abstract_text = raw_text[:2000]  # First 2000 characters

            st.write("\u270D\uFE0F Generating abstract...")
            abstract = abstract_pipe(
                clean_abstract_text,
                max_length=150,
                min_length=50,
                do_sample=False
            )[0]['summary_text']

            st.write("\U0001F58B\uFE0F Creating title...")
            title = generate_title(abstract, title_pipe)
        except Exception as e:
            # Flag the failure on the status widget itself; otherwise the
            # container would be marked complete when the block exits.
            status.update(label="Processing failed", state="error")
            st.error(f"Processing failed: {str(e)}")
            return
        else:
            status.update(label="Complete!", state="complete", expanded=False)

    # Display results outside the status container: anything written inside
    # it would be hidden once the container is collapsed above.
    # The text originates from an untrusted PDF and is injected into raw
    # HTML, so it is escaped first.
    st.markdown(f"""
    <div style='margin-top: 30px;'>
        <p style='font-size: 14px; font-weight: bold;'>TITLE</p>
        <p style='font-size: 14px; margin-bottom: 20px;'>{html.escape(title)}</p>
        <p style='font-size: 12px; font-weight: bold;'>ABSTRACT</p>
        <p style='font-size: 12px;'>{html.escape(abstract)}</p>
    </div>
    """, unsafe_allow_html=True)

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()