snkris committed on
Commit
e7c8f40
·
verified ·
1 Parent(s): 489075a

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +155 -0
  2. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
import html
import os
import re

import pytesseract
import streamlit as st
import torch
from pdf2image import convert_from_bytes
from PyPDF2 import PdfReader
from transformers import pipeline, AutoTokenizer
9
+
10
# Configuration
ABSTRACT_MODEL = "sshleifer/distilbart-cnn-12-6"  # model id used for the summarization pipeline
TITLE_MODEL = "linydub/bart-large-samsum"  # model id used for the title pipeline
MAX_FILE_SIZE_MB = 10  # uploads larger than this are rejected in main()

# Location of the Tesseract executable. The TESSERACT_PATH environment
# variable takes precedence; the fallback is the usual Windows install path.
TESSERACT_PATH = os.environ.get(
    "TESSERACT_PATH",
    r'C:\Program Files\Tesseract-OCR\tesseract.exe',
)

# Only override pytesseract's command when the configured file really exists.
# Otherwise pytesseract keeps its own default command, so hosts where
# Tesseract is on PATH (e.g. Linux deployments) keep working instead of
# failing on a missing Windows path.
if os.path.isfile(TESSERACT_PATH):
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH
18
+
19
@st.cache_resource
def load_models():
    """Build the summarization and title-generation pipelines.

    The function is wrapped in ``st.cache_resource`` so that Streamlit can
    reuse the loaded objects instead of re-creating them on every rerun.

    Returns:
        tuple: ``(abstractive, title_pipe, abs_tokenizer, title_tokenizer)``
        where the first two items are ``transformers`` pipelines and the
        last two are the tokenizers they were built with.
    """
    # Pick the device once so both pipelines run on the same hardware:
    # GPU 0 when CUDA is available, otherwise CPU (-1).
    device = 0 if torch.cuda.is_available() else -1

    with st.spinner('🚀 Loading AI models (first time 2-5 mins)...'):
        # Abstract model
        abs_tokenizer = AutoTokenizer.from_pretrained(ABSTRACT_MODEL)
        abstractive = pipeline(
            "summarization",
            model=ABSTRACT_MODEL,
            tokenizer=abs_tokenizer,
            device=device,
        )

        # Title model
        title_tokenizer = AutoTokenizer.from_pretrained(TITLE_MODEL)
        title_pipe = pipeline(
            "text2text-generation",
            model=TITLE_MODEL,
            tokenizer=title_tokenizer,
            max_length=60,
            device=device,
        )

    return abstractive, title_pipe, abs_tokenizer, title_tokenizer
42
+
43
def extract_text(pdf_file):
    """Return cleaned text from an uploaded PDF, using OCR when needed.

    The embedded text layer is read first. If that yields only whitespace,
    every page is rendered to an image and passed through Tesseract.

    Args:
        pdf_file: Uploaded file object; it is handed to ``PdfReader`` and
            must also provide ``getvalue()`` for the OCR fallback.

    Returns:
        str: Output of ``clean_text`` on the extracted text, or ``""`` if
        any step raised (the error is shown via ``st.error``).
    """
    try:
        # Pass 1: embedded text, one entry per page ("" for empty pages).
        page_texts = []
        for page in PdfReader(pdf_file).pages:
            page_texts.append(page.extract_text() or "")
        content = " ".join(page_texts)

        # Pass 2: nothing usable was embedded, so OCR the rendered pages.
        if content.strip() == "":
            scanned_pages = convert_from_bytes(pdf_file.getvalue())
            content = " ".join(map(pytesseract.image_to_string, scanned_pages))

        return clean_text(content)
    except Exception as err:
        st.error(f"PDF Error: {str(err)}")
        return ""
59
+
60
def clean_text(text):
    """Strip common boilerplate from text extracted out of a paper PDF.

    Removed, case-insensitively: lines holding only a number (page
    numbers), "Proceedings of ..." header lines, arXiv identifier lines,
    copyright lines, URLs, and "Figure N" / "Table N" labels.

    Args:
        text: Raw extracted text.

    Returns:
        str: The text with the patterns above removed and leading/trailing
        whitespace stripped.
    """
    patterns = [
        # Page numbers. The closing newline is only looked at, not consumed,
        # so the lines before and after the number stay separated instead of
        # being fused into one word, and back-to-back number lines all match.
        r'\n\s*\d+\s*(?=\n)',
        r'Proceedings of .*?\n',      # Conference headers
        r'arXiv:\d+\.\d+v\d+.*?\n',   # arXiv footers
        # Copyright lines. U+00A9 is the copyright sign, written as an escape
        # so the pattern survives encoding mishaps in the source file.
        r'\u00a9\d{4}.*?\n',
        r'http\S+',                   # URLs
        r'\b(?:Figure|Table)\s+\d+',  # Figure/table labels
    ]

    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)

    return text.strip()
75
+
76
def generate_title(abstract, title_pipe):
    """Ask the title pipeline for a short title and cap it at five words.

    Args:
        abstract: Text the title should describe; it is embedded in the
            prompt sent to the pipeline.
        title_pipe: Callable taking the prompt plus generation keyword
            arguments and returning a list whose first element is a dict
            with a ``'generated_text'`` entry.

    Returns:
        str: The generated text with ``<pad>`` / ``</s>`` markers removed,
        truncated to its first five words when it is longer than that.
    """
    generation_options = {
        "num_beams": 5,
        "early_stopping": True,
        "max_length": 10,  # Limit to ~4-5 words
        "do_sample": False,
    }
    request = f"Generate a short, research-style title (4-5 words) for this abstract: {abstract}"

    outputs = title_pipe(request, **generation_options)
    candidate = outputs[0]['generated_text'].strip()

    # Drop special-token markers that may leak into the decoded text.
    for marker in ("<pad>", "</s>"):
        candidate = candidate.replace(marker, "")
    candidate = candidate.strip()

    # Short results are returned untouched; longer ones keep five words.
    tokens = candidate.split()
    return candidate if len(tokens) <= 5 else " ".join(tokens[:5])
97
+
98
def main():
    """Render the page: upload a PDF, then show a generated title and abstract.

    Flow: draw the heading, uploader and button; once the button is pressed
    with a file present, enforce the size limit, extract the text, load the
    models, summarize the first 2000 characters into an abstract and derive
    a title from that abstract. Progress is reported in a status container;
    the final result is rendered below it.
    """
    # Main title
    st.markdown("<h1 style='text-align: center;'>RESEARCH PAPER TITLE AND ABSTRACT GENERATION</h1>",
                unsafe_allow_html=True)

    # Upload section
    col1, col2 = st.columns([4, 1])
    with col1:
        uploaded_file = st.file_uploader("Upload here", type=["pdf"], label_visibility="collapsed")
    with col2:
        generate_btn = st.button("ENTER", use_container_width=True)

    if not generate_btn:
        return

    if not uploaded_file:
        # Give feedback instead of silently ignoring the click.
        st.warning("Please upload a PDF file first")
        return

    if uploaded_file.size > MAX_FILE_SIZE_MB * 1024 * 1024:
        st.error(f"File too large! Max {MAX_FILE_SIZE_MB}MB allowed")
        return

    raw_text = extract_text(uploaded_file)
    if not raw_text.strip():
        st.warning("No text extracted - document might be corrupted")
        return

    # The tokenizers returned alongside the pipelines are not needed here.
    abstract_pipe, title_pipe, _, _ = load_models()

    with st.status("Processing...", expanded=True) as status:
        try:
            st.write("📖 Analyzing document...")
            clean_abstract_text = raw_text[:2000]  # First 2000 characters

            st.write("✍️ Generating abstract...")
            abstract = abstract_pipe(
                clean_abstract_text,
                max_length=150,
                min_length=50,
                do_sample=False
            )[0]['summary_text']

            st.write("🖋️ Creating title...")
            title = generate_title(abstract, title_pipe)
        except Exception as e:
            # Mark the status as failed so it does not keep showing "running".
            status.update(label="Processing failed", state="error", expanded=True)
            st.error(f"Processing failed: {str(e)}")
            return

        status.update(label="Complete!", state="complete", expanded=False)

    # Display results outside the status container: the container is
    # collapsed on completion, which would hide anything rendered inside it.
    # The generated strings originate from the uploaded document, so they are
    # HTML-escaped before being placed into markup rendered with
    # unsafe_allow_html=True.
    safe_title = html.escape(title)
    safe_abstract = html.escape(abstract)
    st.markdown(f"""
    <div style='margin-top: 30px;'>
        <p style='font-size: 14px; font-weight: bold;'>TITLE</p>
        <p style='font-size: 14px; margin-bottom: 20px;'>{safe_title}</p>
        <p style='font-size: 12px; font-weight: bold;'>ABSTRACT</p>
        <p style='font-size: 12px;'>{safe_abstract}</p>
    </div>
    """, unsafe_allow_html=True)


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ --extra-index-url https://download.pytorch.org/whl/cpu
2
+ torch==2.3.0+cpu
3
+ streamlit==1.30.0
4
+ PyPDF2==3.0.1
5
+ transformers==4.38.2
6
+ sentencepiece==0.2.0
7
+ pdf2image==1.17.0
8
+ pytesseract==0.3.10
9
+ pillow==10.3.0
10
+ python-dotenv==1.0.1