Update app.py
app.py CHANGED
@@ -5,16 +5,6 @@ import gradio as gr
 import time
 import os
 import json
-import PyPDF2
-import io
-import asyncio
-import aiohttp
-import aiofiles
-from concurrent.futures import ThreadPoolExecutor
-import re
-from datetime import datetime
-import zipfile
-import base64
 
 def get_rank_papers(url, progress=gr.Progress(track_tqdm=True)):
     base_url = "https://paperswithcode.com"
@@ -109,103 +99,6 @@ def load_all_data():
     greatest_count, greatest_html = update_display("greatest")
     return top_count, top_html, new_count, new_html, greatest_count, greatest_html
 
-def safe_filename(title):
-    """Convert a string to a safe filename."""
-    return re.sub(r'[^\w\-_\. ]', '_', title)
-
-def create_date_directory():
-    """Create a directory named with the current date."""
-    date_str = datetime.now().strftime("%Y-%m-%d")
-    os.makedirs(date_str, exist_ok=True)
-    return date_str
-
-async def download_and_save_pdf(session, title, paper_info, directory):
-    pdf_url = paper_info['pdf_link']
-    if not pdf_url:
-        return f"No PDF link available for: {title}"
-
-    try:
-        async with session.get(pdf_url) as response:
-            pdf_content = await response.read()
-
-        if len(pdf_content) < 2048:  # Check if the PDF is less than 2KB
-            return f"Downloaded PDF for {title} is too small (less than 2KB). Skipping."
-
-        safe_title = safe_filename(title)
-        filename = f"{safe_title}.pdf"
-        filepath = os.path.join(directory, filename)
-
-        async with aiofiles.open(filepath, 'wb') as f:
-            await f.write(pdf_content)
-
-        return f"Successfully saved: {filename}"
-    except Exception as e:
-        return f"Error saving PDF for {title}: {str(e)}"
-
-async def process_papers(data, directory, progress=gr.Progress()):
-    async with aiohttp.ClientSession() as session:
-        tasks = []
-        for title, paper_info in data.items():
-            task = asyncio.ensure_future(download_and_save_pdf(session, title, paper_info, directory))
-            tasks.append(task)
-
-        results = []
-        for i, task in enumerate(asyncio.as_completed(tasks), start=1):
-            result = await task
-            results.append(result)
-            progress(i / len(tasks), f"Processed {i}/{len(tasks)} papers")
-
-        return "\n".join(results)
-
-def zip_directory(directory):
-    """Zip the entire directory, excluding files smaller than 2KB."""
-    zip_filename = f"{directory}.zip"
-    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
-        for root, _, files in os.walk(directory):
-            for file in files:
-                file_path = os.path.join(root, file)
-                if os.path.getsize(file_path) >= 2048:  # Only include files 2KB or larger
-                    zipf.write(file_path,
-                               os.path.relpath(file_path,
-                                               os.path.join(directory, '..')))
-    return zip_filename
-
-def get_base64_download_link(file_path):
-    """Create a base64 download link for a file."""
-    with open(file_path, "rb") as file:
-        content = file.read()
-    b64 = base64.b64encode(content).decode()
-    return f'<a href="data:application/zip;base64,{b64}" download="{os.path.basename(file_path)}">Download {os.path.basename(file_path)}</a>'
-
-def get_existing_zip_links():
-    """Get download links for existing zip files."""
-    links = []
-    for file in os.listdir('.'):
-        if file.endswith('.zip') and os.path.isfile(file):
-            links.append(get_base64_download_link(file))
-    return "<br>".join(links)
-
-def download_all_papers(progress=gr.Progress()):
-    all_data = {}
-    for category in ["top", "latest", "greatest"]:
-        cache_file = f"{category}_papers_cache.json"
-        data = load_cached_data(cache_file)
-        if data:
-            all_data.update(data)
-
-    date_directory = create_date_directory()
-    results = asyncio.run(process_papers(all_data, date_directory, progress))
-
-    zip_file = zip_directory(date_directory)
-    download_link = get_base64_download_link(zip_file)
-
-    existing_links = get_existing_zip_links()
-
-    # Count successful downloads
-    successful_downloads = sum(1 for result in results.split('\n') if result.startswith("Successfully saved:"))
-
-    return f"Papers downloaded: {successful_downloads} out of {len(all_data)}\nAll papers have been processed and saved in {zip_file}\n\n{results}", f"{download_link}<br><br>Previous downloads:<br>{existing_links}"
-
 with gr.Blocks() as demo:
     gr.Markdown("<h1><center>Papers Leaderboard</center></h1>")
 
@@ -227,11 +120,6 @@ with gr.Blocks() as demo:
     greatest_button = gr.Button("Refresh Leaderboard")
     greatest_button.click(fn=lambda: update_display("greatest"), inputs=None, outputs=[greatest_count, greatest_html])
 
-    download_button = gr.Button("📚 Download All Papers", variant="primary")
-    download_output = gr.Textbox(label="Download Status")
-    download_links = gr.HTML(label="Download Links")
-    download_button.click(fn=download_all_papers, inputs=None, outputs=[download_output, download_links])
-
     # Load initial data for all tabs
     demo.load(fn=load_all_data, outputs=[top_count, top_html, new_count, new_html, greatest_count, greatest_html])