awacke1 committed on
Commit
657eccf
·
verified ·
1 Parent(s): cdbe150

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -0
app.py CHANGED
@@ -5,6 +5,16 @@ import gradio as gr
5
  import time
6
  import os
7
  import json
 
 
 
 
 
 
 
 
 
 
8
 
9
  def get_rank_papers(url, progress=gr.Progress(track_tqdm=True)):
10
  base_url = "https://paperswithcode.com"
@@ -99,6 +109,103 @@ def load_all_data():
99
  greatest_count, greatest_html = update_display("greatest")
100
  return top_count, top_html, new_count, new_html, greatest_count, greatest_html
101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  with gr.Blocks() as demo:
103
  gr.Markdown("<h1><center>Papers Leaderboard</center></h1>")
104
 
@@ -120,6 +227,11 @@ with gr.Blocks() as demo:
120
  greatest_button = gr.Button("Refresh Leaderboard")
121
  greatest_button.click(fn=lambda: update_display("greatest"), inputs=None, outputs=[greatest_count, greatest_html])
122
 
 
 
 
 
 
123
  # Load initial data for all tabs
124
  demo.load(fn=load_all_data, outputs=[top_count, top_html, new_count, new_html, greatest_count, greatest_html])
125
 
 
5
  import time
6
  import os
7
  import json
8
+ import PyPDF2
9
+ import io
10
+ import asyncio
11
+ import aiohttp
12
+ import aiofiles
13
+ from concurrent.futures import ThreadPoolExecutor
14
+ import re
15
+ from datetime import datetime
16
+ import zipfile
17
+ import base64
18
 
19
  def get_rank_papers(url, progress=gr.Progress(track_tqdm=True)):
20
  base_url = "https://paperswithcode.com"
 
109
  greatest_count, greatest_html = update_display("greatest")
110
  return top_count, top_html, new_count, new_html, greatest_count, greatest_html
111
 
112
def safe_filename(title):
    """Return *title* with filesystem-unsafe characters replaced by '_'."""
    # Anything that is not a word character, hyphen, underscore, dot or
    # space is considered unsafe for a filename.
    unsafe_chars = re.compile(r'[^\w\-_\. ]')
    return unsafe_chars.sub('_', title)
115
+
116
def create_date_directory():
    """Ensure a directory named after today's date (YYYY-MM-DD) exists.

    Returns the directory name, relative to the current working directory.
    """
    today = datetime.now().strftime("%Y-%m-%d")
    if not os.path.isdir(today):
        os.makedirs(today)
    return today
121
+
122
async def download_and_save_pdf(session, title, paper_info, directory):
    """Download one paper's PDF and write it into *directory*.

    Parameters
    ----------
    session : aiohttp.ClientSession
        Session used for the HTTP request.
    title : str
        Paper title; sanitized via safe_filename() to build the filename.
    paper_info : dict
        Expected to carry a 'pdf_link' key (may be empty/None).
    directory : str
        Target directory; assumed to already exist.

    Returns a human-readable status string.  Messages for successful saves
    start with "Successfully saved:" — download_all_papers() counts on
    that exact prefix.
    """
    pdf_url = paper_info['pdf_link']
    if not pdf_url:
        return f"No PDF link available for: {title}"

    try:
        async with session.get(pdf_url) as response:
            # Surface HTTP errors (404, 403, ...) instead of treating an
            # error page's body as a PDF.
            if response.status != 200:
                return f"Error saving PDF for {title}: HTTP {response.status}"
            pdf_content = await response.read()

        if len(pdf_content) < 2048:  # Check if the PDF is less than 2KB
            return f"Downloaded PDF for {title} is too small (less than 2KB). Skipping."

        safe_title = safe_filename(title)
        filename = f"{safe_title}.pdf"
        filepath = os.path.join(directory, filename)

        async with aiofiles.open(filepath, 'wb') as f:
            await f.write(pdf_content)

        # Bug fix: the message previously read "Successfully saved: (unknown)"
        # (an f-string with no placeholder) and never named the file written.
        return f"Successfully saved: {filename}"
    except Exception as e:
        return f"Error saving PDF for {title}: {str(e)}"
144
+
145
async def process_papers(data, directory, progress=gr.Progress()):
    """Concurrently download every paper in *data* into *directory*.

    *data* maps paper titles to info dicts carrying a 'pdf_link' entry.
    Progress is reported as each download finishes; all per-paper status
    messages are returned joined with newlines.
    """
    async with aiohttp.ClientSession() as session:
        tasks = [
            asyncio.ensure_future(
                download_and_save_pdf(session, title, info, directory)
            )
            for title, info in data.items()
        ]

        statuses = []
        total = len(tasks)
        # as_completed yields results in finish order, not submit order.
        for done, finished in enumerate(asyncio.as_completed(tasks), start=1):
            statuses.append(await finished)
            progress(done / total, f"Processed {done}/{total} papers")

        return "\n".join(statuses)
159
+
160
def zip_directory(directory):
    """Archive *directory* into '<directory>.zip', skipping files under 2KB.

    Entries are stored relative to the directory's parent, so the archive
    unpacks into a folder of the same name.  Returns the zip filename.
    """
    zip_filename = f"{directory}.zip"
    parent = os.path.join(directory, '..')
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as archive:
        for root, _, filenames in os.walk(directory):
            for name in filenames:
                path = os.path.join(root, name)
                # Tiny files are assumed to be failed/placeholder downloads.
                if os.path.getsize(path) < 2048:
                    continue
                archive.write(path, os.path.relpath(path, parent))
    return zip_filename
172
+
173
def get_base64_download_link(file_path):
    """Return an HTML anchor that downloads *file_path* via a data: URI.

    The whole file is embedded base64-encoded in the href, so this is only
    sensible for modestly sized archives.
    """
    with open(file_path, "rb") as fh:
        encoded = base64.b64encode(fh.read()).decode()
    name = os.path.basename(file_path)
    return (
        f'<a href="data:application/zip;base64,{encoded}" '
        f'download="{name}">Download {name}</a>'
    )
179
+
180
def get_existing_zip_links():
    """Build '<br>'-joined download links for every zip file in the cwd."""
    return "<br>".join(
        get_base64_download_link(entry)
        for entry in os.listdir('.')
        if entry.endswith('.zip') and os.path.isfile(entry)
    )
187
+
188
def download_all_papers(progress=gr.Progress()):
    """Download every cached paper, zip the results, and build status HTML.

    Merges cached metadata for each leaderboard category, downloads the
    PDFs into a date-named directory, zips that directory, and returns a
    (status_text, links_html) pair for the two Gradio outputs.
    """
    # NOTE(review): the tabs elsewhere use a "new" category while this
    # reads "latest" — confirm the cache filenames match what was written.
    merged = {}
    for category in ["top", "latest", "greatest"]:
        cached = load_cached_data(f"{category}_papers_cache.json")
        if cached:
            merged.update(cached)

    target_dir = create_date_directory()
    results = asyncio.run(process_papers(merged, target_dir, progress))

    zip_file = zip_directory(target_dir)
    download_link = get_base64_download_link(zip_file)
    existing_links = get_existing_zip_links()

    # Count successful downloads
    ok = sum(
        1 for line in results.split('\n')
        if line.startswith("Successfully saved:")
    )

    status = (
        f"Papers downloaded: {ok} out of {len(merged)}\n"
        f"All papers have been processed and saved in {zip_file}\n\n{results}"
    )
    links = f"{download_link}<br><br>Previous downloads:<br>{existing_links}"
    return status, links
208
+
209
  with gr.Blocks() as demo:
210
  gr.Markdown("<h1><center>Papers Leaderboard</center></h1>")
211
 
 
227
  greatest_button = gr.Button("Refresh Leaderboard")
228
  greatest_button.click(fn=lambda: update_display("greatest"), inputs=None, outputs=[greatest_count, greatest_html])
229
 
230
+ download_button = gr.Button("📚 Download All Papers", variant="primary")
231
+ download_output = gr.Textbox(label="Download Status")
232
+ download_links = gr.HTML(label="Download Links")
233
+ download_button.click(fn=download_all_papers, inputs=None, outputs=[download_output, download_links])
234
+
235
  # Load initial data for all tabs
236
  demo.load(fn=load_all_data, outputs=[top_count, top_html, new_count, new_html, greatest_count, greatest_html])
237