awacke1 committed
Commit 2b31c18 · verified · 1 Parent(s): 0d28f6f

Create app.py

Files changed (1)
  1. app.py +252 -0
app.py ADDED
@@ -0,0 +1,252 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd
import gradio as gr
import time
import os
import json
import PyPDF2
import io
import asyncio
import aiohttp
import aiofiles
from concurrent.futures import ThreadPoolExecutor
import re
from datetime import datetime
import zipfile
import base64

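# Scrape a Papers with Code listing page. The site is paged via a ?page= offset;
# each paper card yields the title, detail-page link, GitHub star count, and (when
# found on the paper's abstract page) a PDF link. Scraping stops when a page has
# no more cards or after ten duplicate titles have been seen.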
def get_rank_papers(url, progress=gr.Progress(track_tqdm=True)):
    base_url = "https://paperswithcode.com"
    session = requests.Session()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Cache-Control': 'no-cache'
    }
    print("Time run at:", time.ctime())
    offset = 0
    data_list = {}
    break_duplicate = 10

    while True:
        response = session.get(url, headers=headers, params={'page': offset})
        if response.status_code != 200:
            print('Failed to retrieve data')
            break
        soup = BeautifulSoup(response.text, 'html.parser')
        paper_info = soup.find_all('div', class_='row infinite-item item paper-card')
        if not paper_info:
            break
        for ppr in paper_info:
            title = ppr.find('h1').text.strip()

            if "paper" in ppr.find('a')['href']:
                link = base_url + ppr.find('a')['href']
            else:
                link = ppr.find('a')['href']
            Github_Star = ppr.find('span', class_='badge badge-secondary').text.strip().replace(',', '')
            pdf_link = ''
            try:
                response_link = session.get(link, headers=headers)
                soup_link = BeautifulSoup(response_link.text, 'html.parser')
                paper_info_link = soup_link.find_all('div', class_='paper-abstract')
                pdf_link = paper_info_link[0].find('div', class_='col-md-12').find('a')['href']
            except Exception:
                pass
            if title not in data_list:
                data_list[title] = {'link': link, 'Github Star': int(Github_Star), 'pdf_link': pdf_link.strip()}
            else:
                break_duplicate -= 1
                if break_duplicate == 0:
                    return data_list
        offset += 1
        progress.update(offset)
    print('Data retrieval complete')
    return data_list

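# Lightweight JSON cache so repeated refreshes do not re-scrape the site.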
def load_cached_data(cache_file):
    if os.path.exists(cache_file):
        with open(cache_file, 'r') as f:
            return json.load(f)
    return None

def save_cached_data(data, cache_file):
    with open(cache_file, 'w') as f:
        json.dump(data, f)

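# Turn the scraped dict into an HTML-ready DataFrame: one row per paper, sorted
# by GitHub stars (descending), with the paper and PDF links rendered as <a> tags.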
def format_dataframe(data):
    df = pd.DataFrame(data).T
    df['title'] = df.index
    df = df[['title', 'Github Star', 'link', 'pdf_link']]
    # Sort the dataframe by 'Github Star' in descending order
    df = df.sort_values(by='Github Star', ascending=False)
    df['link'] = df['link'].apply(lambda x: f'<a href="{x}" target="_blank">Link</a>')
    df['pdf_link'] = df['pdf_link'].apply(lambda x: f'<a href="{x}" target="_blank">{x}</a>')
    return df

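# UI data loading: load_and_cache_data prefers the per-category cache file and only
# scrapes on a miss; update_display maps a category name to its Papers with Code URL
# and returns the row count plus an HTML table; load_all_data refreshes all three tabs.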
def load_and_cache_data(url, cache_file):
    cached_data = load_cached_data(cache_file)

    if cached_data:
        print(f"Loading cached data from {cache_file}")
        return cached_data

    print(f"Fetching new data from {url}")
    new_data = get_rank_papers(url)
    save_cached_data(new_data, cache_file)
    return new_data

def update_display(category):
    cache_file = f"{category}_papers_cache.json"
    url = f"https://paperswithcode.com/{category}" if category != "top" else "https://paperswithcode.com/"

    data = load_and_cache_data(url, cache_file)
    df = format_dataframe(data)

    return len(df), df.to_html(escape=False, index=False)

def load_all_data():
    top_count, top_html = update_display("top")
    new_count, new_html = update_display("latest")
    greatest_count, greatest_html = update_display("greatest")
    return top_count, top_html, new_count, new_html, greatest_count, greatest_html

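# Filesystem helpers: sanitize paper titles for use as filenames and create a
# YYYY-MM-DD directory for the current batch of downloads.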
def safe_filename(title):
    """Convert a string to a safe filename."""
    return re.sub(r'[^\w\-_\. ]', '_', title)

def create_date_directory():
    """Create a directory named with the current date."""
    date_str = datetime.now().strftime("%Y-%m-%d")
    os.makedirs(date_str, exist_ok=True)
    return date_str

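# Download a single paper's PDF asynchronously with a 60-second timeout. Papers
# without a PDF link and responses under 2 KB (typically error pages) are skipped.
# Returns a status message and the saved file path (None on failure).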
async def download_and_save_pdf(session, title, paper_info, directory):
    pdf_url = paper_info['pdf_link']
    if not pdf_url:
        return f"No PDF link available for: {title}", None

    try:
        timeout = aiohttp.ClientTimeout(total=60)  # 60 seconds timeout
        async with session.get(pdf_url, timeout=timeout) as response:
            if response.status != 200:
                return f"Failed to download PDF for {title}: HTTP {response.status}", None
            pdf_content = await response.read()

            if len(pdf_content) < 2048:  # Check if the PDF is less than 2KB
                return f"Downloaded PDF for {title} is too small (less than 2KB). Skipping.", None

            safe_title = safe_filename(title)
            filename = f"{safe_title}.pdf"
            filepath = os.path.join(directory, filename)

            async with aiofiles.open(filepath, 'wb') as f:
                await f.write(pdf_content)

            return f"Successfully saved: {filename}", filepath
    except asyncio.TimeoutError:
        return f"Timeout while downloading PDF for {title}", None
    except Exception as e:
        return f"Error saving PDF for {title}: {str(e)}", None

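# Download all PDFs concurrently over a single aiohttp session, updating the Gradio
# progress bar as each task completes and collecting successes and error messages.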
async def process_papers(data, directory, progress=gr.Progress()):
    async with aiohttp.ClientSession() as session:
        tasks = []
        for title, paper_info in data.items():
            task = asyncio.ensure_future(download_and_save_pdf(session, title, paper_info, directory))
            tasks.append(task)

        results = []
        successful_downloads = []
        errors = []
        for i, task in enumerate(asyncio.as_completed(tasks), start=1):
            result, filepath = await task
            results.append(result)
            if filepath:
                successful_downloads.append(filepath)
            else:
                errors.append(result)
            progress(i / len(tasks), f"Processed {i}/{len(tasks)} papers")

    return results, successful_downloads, errors

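# Packaging helpers: zip the downloaded PDFs and expose the archive (plus any
# previously built archives in the working directory) as base64 data-URI download
# links. Embedding large archives as base64 can be memory-heavy for big batches.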
def zip_directory(files_to_zip, directory):
    """Zip the specified files."""
    zip_filename = f"{directory}.zip"
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for file in files_to_zip:
            zipf.write(file, os.path.relpath(file, os.path.join(directory, '..')))
    return zip_filename

def get_base64_download_link(file_path):
    """Create a base64 download link for a file."""
    with open(file_path, "rb") as file:
        content = file.read()
    b64 = base64.b64encode(content).decode()
    return f'<a href="data:application/zip;base64,{b64}" download="{os.path.basename(file_path)}">Download {os.path.basename(file_path)}</a>'

def get_existing_zip_links():
    """Get download links for existing zip files."""
    links = []
    for file in os.listdir('.'):
        if file.endswith('.zip') and os.path.isfile(file):
            links.append(get_base64_download_link(file))
    return "<br>".join(links)

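# Top-level download action: merge the cached paper lists from all three categories,
# fetch every available PDF into a dated directory, zip the successful downloads,
# and return a text summary plus HTML download links.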
def download_all_papers(progress=gr.Progress()):
    all_data = {}
    for category in ["top", "latest", "greatest"]:
        cache_file = f"{category}_papers_cache.json"
        data = load_cached_data(cache_file)
        if data:
            all_data.update(data)

    date_directory = create_date_directory()
    results, successful_downloads, errors = asyncio.run(process_papers(all_data, date_directory, progress))

    if successful_downloads:
        zip_file = zip_directory(successful_downloads, date_directory)
        download_link = get_base64_download_link(zip_file)
    else:
        download_link = "No papers were successfully downloaded."

    existing_links = get_existing_zip_links()

    summary = f"Papers processed: {len(all_data)}\n"
    summary += f"Successfully downloaded: {len(successful_downloads)}\n"
    summary += f"Errors: {len(errors)}\n\n"
    summary += "Error List:\n" + "\n".join(errors) if errors else "No errors occurred."

    return summary, f"{download_link}<br><br>Previous downloads:<br>{existing_links}"

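# Gradio UI: one tab per leaderboard category with its own refresh button, plus a
# button that downloads every cached paper and reports the result. The tables are
# populated once on page load via demo.load().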
with gr.Blocks() as demo:
    gr.Markdown("<h1><center>Papers Leaderboard</center></h1>")

    with gr.Tab("Top Trending Papers"):
        top_count = gr.Textbox(label="Number of Papers Fetched")
        top_html = gr.HTML()
        top_button = gr.Button("Refresh Leaderboard")
        top_button.click(fn=lambda: update_display("top"), inputs=None, outputs=[top_count, top_html])

    with gr.Tab("New Papers"):
        new_count = gr.Textbox(label="Number of Papers Fetched")
        new_html = gr.HTML()
        new_button = gr.Button("Refresh Leaderboard")
        new_button.click(fn=lambda: update_display("latest"), inputs=None, outputs=[new_count, new_html])

    with gr.Tab("Greatest Papers"):
        greatest_count = gr.Textbox(label="Number of Papers Fetched")
        greatest_html = gr.HTML()
        greatest_button = gr.Button("Refresh Leaderboard")
        greatest_button.click(fn=lambda: update_display("greatest"), inputs=None, outputs=[greatest_count, greatest_html])

    download_button = gr.Button("📚 Download All Papers", variant="primary")
    download_output = gr.Textbox(label="Download Status")
    download_links = gr.HTML(label="Download Links")
    download_button.click(fn=download_all_papers, inputs=None, outputs=[download_output, download_links])

    # Load initial data for all tabs
    demo.load(fn=load_all_data, outputs=[top_count, top_html, new_count, new_html, greatest_count, greatest_html])

# Launch the Gradio interface with a public link
demo.launch(share=True)