Update app.py

app.py CHANGED
@@ -5,6 +5,16 @@ import gradio as gr
 import time
 import os
 import json
+import PyPDF2
+import io
+import asyncio
+import aiohttp
+import aiofiles
+from concurrent.futures import ThreadPoolExecutor
+import re
+from datetime import datetime
+import zipfile
+import base64
 
 def get_rank_papers(url, progress=gr.Progress(track_tqdm=True)):
     base_url = "https://paperswithcode.com"
@@ -99,6 +109,103 @@ def load_all_data():
     greatest_count, greatest_html = update_display("greatest")
     return top_count, top_html, new_count, new_html, greatest_count, greatest_html
 
+def safe_filename(title):
+    """Convert a string to a safe filename."""
+    return re.sub(r'[^\w\-_\. ]', '_', title)
+
+def create_date_directory():
+    """Create a directory named with the current date."""
+    date_str = datetime.now().strftime("%Y-%m-%d")
+    os.makedirs(date_str, exist_ok=True)
+    return date_str
+
+async def download_and_save_pdf(session, title, paper_info, directory):
+    pdf_url = paper_info['pdf_link']
+    if not pdf_url:
+        return f"No PDF link available for: {title}"
+
+    try:
+        async with session.get(pdf_url) as response:
+            pdf_content = await response.read()
+
+        if len(pdf_content) < 2048:  # Check if the PDF is less than 2KB
+            return f"Downloaded PDF for {title} is too small (less than 2KB). Skipping."
+
+        safe_title = safe_filename(title)
+        filename = f"{safe_title}.pdf"
+        filepath = os.path.join(directory, filename)
+
+        async with aiofiles.open(filepath, 'wb') as f:
+            await f.write(pdf_content)
+
+        return f"Successfully saved: {filename}"
+    except Exception as e:
+        return f"Error saving PDF for {title}: {str(e)}"
+
+async def process_papers(data, directory, progress=gr.Progress()):
+    async with aiohttp.ClientSession() as session:
+        tasks = []
+        for title, paper_info in data.items():
+            task = asyncio.ensure_future(download_and_save_pdf(session, title, paper_info, directory))
+            tasks.append(task)
+
+        results = []
+        for i, task in enumerate(asyncio.as_completed(tasks), start=1):
+            result = await task
+            results.append(result)
+            progress(i / len(tasks), f"Processed {i}/{len(tasks)} papers")
+
+    return "\n".join(results)
+
+def zip_directory(directory):
+    """Zip the entire directory, excluding files smaller than 2KB."""
+    zip_filename = f"{directory}.zip"
+    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
+        for root, _, files in os.walk(directory):
+            for file in files:
+                file_path = os.path.join(root, file)
+                if os.path.getsize(file_path) >= 2048:  # Only include files 2KB or larger
+                    zipf.write(file_path,
+                               os.path.relpath(file_path,
+                                               os.path.join(directory, '..')))
+    return zip_filename
+
+def get_base64_download_link(file_path):
+    """Create a base64 download link for a file."""
+    with open(file_path, "rb") as file:
+        content = file.read()
+    b64 = base64.b64encode(content).decode()
+    return f'<a href="data:application/zip;base64,{b64}" download="{os.path.basename(file_path)}">Download {os.path.basename(file_path)}</a>'
+
+def get_existing_zip_links():
+    """Get download links for existing zip files."""
+    links = []
+    for file in os.listdir('.'):
+        if file.endswith('.zip') and os.path.isfile(file):
+            links.append(get_base64_download_link(file))
+    return "<br>".join(links)
+
+def download_all_papers(progress=gr.Progress()):
+    all_data = {}
+    for category in ["top", "latest", "greatest"]:
+        cache_file = f"{category}_papers_cache.json"
+        data = load_cached_data(cache_file)
+        if data:
+            all_data.update(data)
+
+    date_directory = create_date_directory()
+    results = asyncio.run(process_papers(all_data, date_directory, progress))
+
+    zip_file = zip_directory(date_directory)
+    download_link = get_base64_download_link(zip_file)
+
+    existing_links = get_existing_zip_links()
+
+    # Count successful downloads
+    successful_downloads = sum(1 for result in results.split('\n') if result.startswith("Successfully saved:"))
+
+    return f"Papers downloaded: {successful_downloads} out of {len(all_data)}\nAll papers have been processed and saved in {zip_file}\n\n{results}", f"{download_link}<br><br>Previous downloads:<br>{existing_links}"
+
 with gr.Blocks() as demo:
     gr.Markdown("<h1><center>Papers Leaderboard</center></h1>")
 
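Taken together, the additions in this hunk form a small pipeline: sanitize each title into a filename, create a dated directory, download the PDFs concurrently, and report one status line per paper. Below is a minimal sketch of how the helpers compose outside the app; the sample titles and URL are placeholder assumptions, and a plain callable stands in for gr.Progress so the sketch can run without a Gradio event context.

# Hypothetical standalone driver for the helpers above (sample data is an assumption).
sample_data = {
    "Attention Is All You Need": {"pdf_link": "https://arxiv.org/pdf/1706.03762"},
    "Entry Without A PDF": {"pdf_link": None},  # exercises the missing-link branch
}

def log_progress(fraction, desc):
    # Stand-in for gr.Progress() when running outside a Gradio event.
    print(f"{fraction:.0%} {desc}")

if __name__ == "__main__":
    directory = create_date_directory()           # e.g. "2024-06-01"
    results = asyncio.run(process_papers(sample_data, directory, progress=log_progress))
    print(results)                                # one status line per paper
    print("archive:", zip_directory(directory))   # archives only PDFs >= 2KB

Note that download_all_papers itself can call asyncio.run safely because Gradio executes synchronous event handlers in a worker thread, where no event loop is already running.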
@@ -120,6 +227,11 @@ with gr.Blocks() as demo:
     greatest_button = gr.Button("Refresh Leaderboard")
     greatest_button.click(fn=lambda: update_display("greatest"), inputs=None, outputs=[greatest_count, greatest_html])
 
+    download_button = gr.Button("📚 Download All Papers", variant="primary")
+    download_output = gr.Textbox(label="Download Status")
+    download_links = gr.HTML(label="Download Links")
+    download_button.click(fn=download_all_papers, inputs=None, outputs=[download_output, download_links])
+
     # Load initial data for all tabs
     demo.load(fn=load_all_data, outputs=[top_count, top_html, new_count, new_html, greatest_count, greatest_html])
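A design note on the new download links: get_base64_download_link inlines the entire zip into the page as a data URI, which grows the payload by roughly a third, and get_existing_zip_links re-embeds every previous archive on each click, so large archives can stall the browser. A hedged alternative, if that becomes a problem, is to let Gradio serve the file from disk through gr.File; the wrapper and component names below are illustrative, not part of this commit.

# Sketch of a file-based alternative (download_all_papers_to_file and zip_output are assumed names).
def download_all_papers_to_file(progress=gr.Progress()):
    status, _links_html = download_all_papers(progress)
    date_str = datetime.now().strftime("%Y-%m-%d")
    return status, f"{date_str}.zip"  # filepath for gr.File to serve

# In the Blocks layout, replacing the HTML links component:
#     zip_output = gr.File(label="Download Zip")
#     download_button.click(fn=download_all_papers_to_file, inputs=None,
#                           outputs=[download_output, zip_output])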