awacke1 committed on
Commit 7b8a04f · verified · 1 Parent(s): 21021f4

Create app.py

Files changed (1)
  1. app.py +300 -0
app.py ADDED
@@ -0,0 +1,300 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import pandas as pd
+ import gradio as gr
+ import time
+ import os
+ import json
+ import PyPDF2
+ import io
+ import asyncio
+ import aiohttp
+ import aiofiles
+ import re
+ from datetime import datetime
+ import base64
+ import zipfile
+
+ # 🧙‍♂️ Magical Utility Functions 🧙‍♂️
+
+ def safe_filename(title):
+     """Convert a string to a safe filename. No more 'file not found' nightmares! πŸ™…β€β™‚οΈπŸ“"""
+     return re.sub(r'[^\w\-_\. ]', '_', title)
+
+ def create_timestamp_directory():
+     """Create a directory named with the current date and time. It's like a time capsule for your downloads! πŸ—“οΈπŸ“¦"""
+     date_str = datetime.now().strftime("%m-%d-%Y-%H-%M")
+     os.makedirs(date_str, exist_ok=True)
+     return date_str
+
+ def get_base64_download_link(file_path, filename):
+     """Create a base64 download link for a binary file. It's like teleportation for your files! 🌟πŸ“²"""
+     with open(file_path, "rb") as f:
+         content = f.read()
+     b64 = base64.b64encode(content).decode()
+     return f'<a href="data:application/zip;base64,{b64}" download="{filename}">Download {filename}</a>'
+
+ # 🎬 Animated Banner Messages 🎬
+ def animated_banner(message, emoji):
+     """Create an animated banner message. It's like a tiny parade for your console! πŸŽ‰πŸš©"""
+     frames = [
+         f"╔════ {emoji} ════╗\n║ {message:^16} ║\n╚════════════╝",
+         f"╔════ {emoji} ════╗\n║ {message:^16} ║\n╚════════════╝",
+         f"╔════{emoji}════╗\n║ {message:^14} ║\n╚══════════╝",
+         f"╔═══{emoji}═══╗\n║ {message:^12} ║\n╚════════╝",
+         f"╔══{emoji}══╗\n║ {message:^10} ║\n╚══════╝",
+         f"╔═{emoji}═╗\n║ {message:^8} ║\n╚════╝",
+         f"╔{emoji}╗\n║ {message:^6} ║\n╚══╝",
+     ]
+     return frames
+
51
+ # πŸ•΅οΈβ€β™‚οΈ Data Fetching and Caching Shenanigans πŸ•΅οΈβ€β™‚οΈ
52
+
53
+ def get_rank_papers(url, progress=gr.Progress(track_tqdm=True)):
54
+ """Fetch papers from the interwebs. It's like fishing, but for knowledge! πŸŽ£πŸ“š"""
55
+ base_url = "https://paperswithcode.com"
56
+ session = requests.Session()
57
+ headers = {
58
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
59
+ 'Cache-Control': 'no-cache'
60
+ }
61
+ print("Time run at : ", time.ctime())
62
+ offset = 0
63
+ data_list = {}
64
+ break_duplicate = 10
65
+
66
+ while True:
67
+ response = session.get(url, headers=headers, params={'page': offset})
68
+ if response.status_code != 200:
69
+ print('Failed to retrieve data')
70
+ break
71
+ soup = BeautifulSoup(response.text, 'html.parser')
72
+ paper_info = soup.find_all('div', class_='row infinite-item item paper-card')
73
+ if not paper_info:
74
+ print("No paper information found.")
75
+ break
76
+ for ppr in paper_info:
77
+ title = ppr.find('h1').text.strip()
78
+
79
+ if "paper" in ppr.find('a')['href']:
80
+ link = base_url + ppr.find('a')['href']
81
+ else:
82
+ link = ppr.find('a')['href']
83
+
84
+ Github_Star = ppr.find('span', class_='badge badge-secondary').text.strip().replace(',', '') if ppr.find('span', class_='badge badge-secondary') else "0"
85
+
86
+ pdf_link = ''
87
+ try:
88
+ response_link = session.get(link, headers=headers)
89
+ soup_link = BeautifulSoup(response_link.text, 'html.parser')
90
+ paper_info_link = soup_link.find_all('div', class_='paper-abstract')
91
+ pdf_link = paper_info_link[0].find('div', class_='col-md-12').find('a')['href']
92
+ except Exception as e:
93
+ print(f"Failed to retrieve PDF link for {title}: {e}")
94
+
95
+ print(f"Title: {title}, Link: {link}, Github Star: {Github_Star}, PDF Link: {pdf_link}")
96
+
97
+ if title not in data_list:
98
+ data_list[title] = {'link': link, 'Github Star': int(Github_Star), 'pdf_link': pdf_link.strip()}
99
+ else:
100
+ break_duplicate -= 1
101
+ if break_duplicate == 0:
102
+ return data_list
103
+ offset += 1
104
+ progress.update(offset)
105
+ print('Data retrieval complete')
106
+ return data_list
+
+ def load_cached_data(cache_file):
+     """Load cached data. It's like finding money in your old jeans! πŸ’°πŸ§΅"""
+     if os.path.exists(cache_file):
+         with open(cache_file, 'r') as f:
+             return json.load(f)
+     return None
+
+ def save_cached_data(data, cache_file):
+     """Save data to cache. Future you will thank present you! πŸ¦Έβ€β™‚οΈπŸ•°οΈ"""
+     with open(cache_file, 'w') as f:
+         json.dump(data, f)
+
+ def load_and_cache_data(url, cache_file):
+     """Load data from cache or fetch new data. It's like a time machine for your data! β°πŸ”„"""
+     cached_data = load_cached_data(cache_file)
+
+     if cached_data:
+         print(f"Loading cached data from {cache_file}")
+         return cached_data
+
+     print(f"Fetching new data from {url}")
+     new_data = get_rank_papers(url)
+     save_cached_data(new_data, cache_file)
+     return new_data
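
Each cache file is plain JSON keyed by paper title, with the fields collected by get_rank_papers. A rough sketch of the round trip, using a hypothetical filename and made-up entry values (not part of the committed file):

    entry = {
        "Example Paper Title": {                                   # hypothetical title
            "link": "https://paperswithcode.com/paper/example",    # hypothetical link
            "Github Star": 123,
            "pdf_link": "https://example.org/example.pdf",         # hypothetical PDF URL
        }
    }
    save_cached_data(entry, "example_papers_cache.json")
    # With the cache file present, this returns the cached copy without scraping:
    assert load_and_cache_data("https://paperswithcode.com/", "example_papers_cache.json") == entry
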
+
+ # πŸ“Š Data Processing and Display Magic πŸ“Š
+
+ def format_dataframe(data):
+     """Format data into a pretty DataFrame. It's like giving your data a makeover! πŸ’…πŸ“ˆ"""
+     if not data:
+         print("No data found to format.")
+         return pd.DataFrame()
+
+     df = pd.DataFrame(data).T
+     df['title'] = df.index
+
+     # Check if required columns are present
+     if 'Github Star' in df.columns and 'link' in df.columns and 'pdf_link' in df.columns:
+         df = df[['title', 'Github Star', 'link', 'pdf_link']]
+         df = df.sort_values(by='Github Star', ascending=False)
+         df['link'] = df['link'].apply(lambda x: f'<a href="{x}" target="_blank">Link</a>')
+         df['pdf_link'] = df['pdf_link'].apply(lambda x: f'<a href="{x}" target="_blank">{x}</a>')
+     else:
+         print("Required columns are missing in the dataframe.")
+         print(f"Columns available: {df.columns}")
+
+     return df
+
+ def update_display(category):
+     """Update the display for a category. Freshen up your data like it's spring cleaning! 🧹🌸"""
+     cache_file = f"{category}_papers_cache.json"
+     url = f"https://paperswithcode.com/{category}" if category != "top" else "https://paperswithcode.com/"
+
+     data = load_and_cache_data(url, cache_file)
+     df = format_dataframe(data)
+
+     return len(df), df.to_html(escape=False, index=False)
+
+ def load_all_data():
+     """Load data for all categories. It's like a buffet for your brain! 🧠🍽️"""
+     top_count, top_html = update_display("top")
+     new_count, new_html = update_display("latest")
+     greatest_count, greatest_html = update_display("greatest")
+     return top_count, top_html, new_count, new_html, greatest_count, greatest_html
+
+ # πŸš€ Asynchronous Paper Downloading and Zipping πŸš€
+
+ async def download_pdf(session, title, paper_info, directory):
+     """Download a PDF file. It's like collecting treasures from the internet! πŸ’ŽπŸ“„"""
+     pdf_url = paper_info['pdf_link']
+     if not pdf_url:
+         return f"🚫 No PDF link for: {title}. It's playing hide and seek! πŸ™ˆ", None
+
+     try:
+         timeout = aiohttp.ClientTimeout(total=60)  # 60 seconds timeout
+         async with session.get(pdf_url, timeout=timeout) as response:
+             if response.status != 200:
+                 return f"🚨 Failed to grab PDF for {title}: HTTP {response.status}. The internet gremlins strike again! πŸ‘Ή", None
+             pdf_content = await response.read()
+
+         file_length = len(pdf_content)
+         if file_length < 5000:  # Check if the PDF is less than 5KB
+             return f"🐜 PDF for {title} is tiny ({file_length} bytes). It's like a paper for ants! πŸœπŸ“„", None
+
+         # Save the PDF to the directory
+         safe_title = safe_filename(title)
+         pdf_filename = f"{safe_title}.pdf"
+         pdf_filepath = os.path.join(directory, pdf_filename)
+
+         async with aiofiles.open(pdf_filepath, 'wb') as f:
+             await f.write(pdf_content)
+
+         return f"πŸŽ‰ Successfully downloaded: {pdf_filename} (File length: {file_length} bytes).", pdf_filepath
+     except asyncio.TimeoutError:
+         return f"⏳ Timeout for {title}. The PDF is playing hard to get! πŸ’ƒ", None
+     except Exception as e:
+         return f"πŸ’₯ Oops! Error downloading {title}: {str(e)}. Gremlins in the system! πŸ› οΈ", None
+
+ async def process_papers(data, directory, progress=gr.Progress()):
+     """Process multiple papers asynchronously by downloading their PDFs. πŸ€Ήβ€β™‚οΈπŸ“š"""
+     async with aiohttp.ClientSession() as session:
+         tasks = []
+         for title, paper_info in data.items():
+             task = asyncio.ensure_future(download_pdf(session, title, paper_info, directory))
+             tasks.append(task)
+
+         results = []
+         pdf_files = []
+         for i, task in enumerate(asyncio.as_completed(tasks), start=1):
+             result, pdf_filepath = await task
+             results.append(result)
+             if pdf_filepath:
+                 pdf_files.append(pdf_filepath)
+             progress(i / len(tasks), f"πŸš€ Processed {i}/{len(tasks)} papers. Downloading...")
+
+     return results, pdf_files
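
The download pipeline can also be exercised without the Gradio UI, for example when debugging the scraper. A minimal sketch, assuming a stand-in progress callable and a made-up paper entry (the empty pdf_link short-circuits before any network request), not part of the committed file:

    class NoOpProgress:
        def __call__(self, *args, **kwargs):
            pass  # stand-in for gr.Progress outside a Gradio event

    papers = {"Example Paper": {"link": "", "Github Star": 0, "pdf_link": ""}}  # made-up entry
    out_dir = create_timestamp_directory()
    statuses, pdf_paths = asyncio.run(process_papers(papers, out_dir, progress=NoOpProgress()))
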
+
+ def zip_files(directory):
+     """Zip all files in the given directory."""
+     zip_filename = f"{directory}.zip"
+     # Write the archive next to the directory rather than inside it,
+     # so os.walk below doesn't pick up the half-written zip itself.
+     zip_filepath = zip_filename
+
+     with zipfile.ZipFile(zip_filepath, 'w') as zipf:
+         for root, _, files in os.walk(directory):
+             for file in files:
+                 file_path = os.path.join(root, file)
+                 zipf.write(file_path, arcname=file)
+
+     return zip_filepath
+
+ def download_all_papers(progress=gr.Progress()):
+     """Download and zip all papers. It's like hosting a paper party, and everyone's invited! πŸŽ‰πŸ“š"""
+     all_data = {}
+     for category in ["top", "latest", "greatest"]:
+         cache_file = f"{category}_papers_cache.json"
+         data = load_cached_data(cache_file)
+         if data:
+             all_data.update(data)
+
+     # Create timestamped directory
+     date_directory = create_timestamp_directory()
+
+     # Download papers
+     results, pdf_files = asyncio.run(process_papers(all_data, date_directory, progress))
+
+     # Zip the directory
+     zip_filepath = zip_files(date_directory)
+
+     # Create a download link
+     download_link = get_base64_download_link(zip_filepath, f"{date_directory}.zip")
+
+     summary = f"πŸ“Š Papers processed: {len(all_data)} (We're basically librarians now!)\n"
+     summary += f"βœ… Successfully downloaded and zipped: {len(pdf_files)} (Take that, PDF gremlins!)\n"
+     summary += f"❌ Errors: {len(results) - len(pdf_files)} (Even superheroes have off days)\n\n"
+     summary += download_link
+
+     # Return three values to match the three outputs wired to the download button below.
+     return summary, "<br>".join(results), "\n".join(results)
+
+ # 🎭 Gradio Interface: Where the Magic Happens 🎭
+
+ with gr.Blocks() as demo:
+     gr.Markdown("<h1><center>Papers Leaderboard</center></h1>")
+
+     with gr.Tab("Top Trending Papers"):
+         top_count = gr.Textbox(label="Number of Papers Fetched")
+         top_html = gr.HTML()
+         top_button = gr.Button("Refresh Leaderboard")
+         top_button.click(fn=lambda: update_display("top"), inputs=None, outputs=[top_count, top_html])
+
+     with gr.Tab("New Papers"):
+         new_count = gr.Textbox(label="Number of Papers Fetched")
+         new_html = gr.HTML()
+         new_button = gr.Button("Refresh Leaderboard")
+         new_button.click(fn=lambda: update_display("latest"), inputs=None, outputs=[new_count, new_html])
+
+     with gr.Tab("Greatest Papers"):
+         greatest_count = gr.Textbox(label="Number of Papers Fetched")
+         greatest_html = gr.HTML()
+         greatest_button = gr.Button("Refresh Leaderboard")
+         greatest_button.click(fn=lambda: update_display("greatest"), inputs=None, outputs=[greatest_count, greatest_html])
+
+     download_button = gr.Button("πŸ“š Download All Papers", variant="primary")
+     download_output = gr.Textbox(label="Download Status")
+     download_links = gr.HTML(label="Download Links")
+     text_output = gr.Code(label="Paper Contents", language="python")
+     download_button.click(fn=download_all_papers, inputs=None, outputs=[download_output, download_links, text_output])
+
+     # Load initial data for all tabs
+     demo.load(fn=load_all_data, outputs=[top_count, top_html, new_count, new_html, greatest_count, greatest_html])
+
+ # πŸš€ Launch the Gradio interface with a public link
+ print("🎭 Launching the Papers Leaderboard! Get ready for a wild ride through the land of academia! πŸŽ’πŸ“š")
+ demo.launch(share=True)
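
Running the script needs the third-party packages imported at the top (requests, beautifulsoup4, pandas, gradio, PyPDF2, aiohttp, aiofiles); PyPDF2 and io are imported but not yet used. launch(share=True) asks Gradio for a temporary public URL, and because the download link embeds the whole zip as base64, the status text grows with the archive size. A local-only launch is the usual alternative:

    # Sketch: serve only on the local machine instead of creating a public share link.
    demo.launch()
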