Update app.py
app.py CHANGED
@@ -11,6 +11,10 @@ import asyncio
 import aiohttp
 import aiofiles
 from concurrent.futures import ThreadPoolExecutor
+import re
+from datetime import datetime
+import zipfile
+import base64
 
 def get_rank_papers(url, progress=gr.Progress(track_tqdm=True)):
     base_url = "https://paperswithcode.com"
@@ -105,51 +109,76 @@ def load_all_data():
     greatest_count, greatest_html = update_display("greatest")
     return top_count, top_html, new_count, new_html, greatest_count, greatest_html
 
-…
+def safe_filename(title):
+    """Convert a string to a safe filename."""
+    return re.sub(r'[^\w\-_\. ]', '_', title)
+
+def create_date_directory():
+    """Create a directory named with the current date."""
+    date_str = datetime.now().strftime("%Y-%m-%d")
+    os.makedirs(date_str, exist_ok=True)
+    return date_str
+
+async def download_and_save_pdf(session, title, paper_info, directory):
     pdf_url = paper_info['pdf_link']
-    cache_file = f"cache/{title.replace(' ', '_')}.md"
-
-    if os.path.exists(cache_file):
-        async with aiofiles.open(cache_file, 'r') as f:
-            return await f.read()
-
     if not pdf_url:
-        return f"…
+        return f"No PDF link available for: {title}"
 
     try:
         async with session.get(pdf_url) as response:
             pdf_content = await response.read()
 
-        …
-        for page in pdf_reader.pages:
-            text += page.extract_text()
+        safe_title = safe_filename(title)
+        filename = f"{safe_title}.pdf"
+        filepath = os.path.join(directory, filename)
 
-        …
-        async with aiofiles.open(cache_file, 'w') as f:
-            await f.write(markdown_text)
-
-        return markdown_text
+        async with aiofiles.open(filepath, 'wb') as f:
+            await f.write(pdf_content)
+
+        return f"Successfully saved: {filename}"
     except Exception as e:
-        return f"…
+        return f"Error saving PDF for {title}: {str(e)}"
 
-async def process_papers(data, progress=gr.Progress()):
+async def process_papers(data, directory, progress=gr.Progress()):
     async with aiohttp.ClientSession() as session:
         tasks = []
         for title, paper_info in data.items():
-            task = asyncio.ensure_future(…
+            task = asyncio.ensure_future(download_and_save_pdf(session, title, paper_info, directory))
             tasks.append(task)
 
-        …
+        results = []
         for i, task in enumerate(asyncio.as_completed(tasks), start=1):
-            …
+            result = await task
+            results.append(result)
             progress(i / len(tasks), f"Processed {i}/{len(tasks)} papers")
 
-    return …
+    return "\n".join(results)
+
+def zip_directory(directory):
+    """Zip the entire directory."""
+    zip_filename = f"{directory}.zip"
+    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
+        for root, _, files in os.walk(directory):
+            for file in files:
+                zipf.write(os.path.join(root, file),
+                           os.path.relpath(os.path.join(root, file),
+                                           os.path.join(directory, '..')))
+    return zip_filename
+
+def get_base64_download_link(file_path):
+    """Create a base64 download link for a file."""
+    with open(file_path, "rb") as file:
+        content = file.read()
+    b64 = base64.b64encode(content).decode()
+    return f'<a href="data:application/zip;base64,{b64}" download="{os.path.basename(file_path)}">Download {os.path.basename(file_path)}</a>'
+
+def get_existing_zip_links():
+    """Get download links for existing zip files."""
+    links = []
+    for file in os.listdir('.'):
+        if file.endswith('.zip') and os.path.isfile(file):
+            links.append(get_base64_download_link(file))
+    return "<br>".join(links)
 
 def download_all_papers(progress=gr.Progress()):
     all_data = {}
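Note on the hunk above: process_papers fans out one task per paper and collects results as they complete. A minimal, self-contained sketch of that pattern, with a stub coroutine standing in for download_and_save_pdf:

    import asyncio

    async def fake_download(title):
        # Stand-in for download_and_save_pdf; simulates network I/O.
        await asyncio.sleep(0.1)
        return f"Successfully saved: {title}.pdf"

    async def main():
        titles = ["Paper A", "Paper B", "Paper C"]
        tasks = [asyncio.ensure_future(fake_download(t)) for t in titles]
        results = []
        # as_completed yields awaitables in completion order, so the joined
        # results are not guaranteed to match the input order.
        for i, task in enumerate(asyncio.as_completed(tasks), start=1):
            results.append(await task)
            print(f"Processed {i}/{len(tasks)} papers")
        return "\n".join(results)

    print(asyncio.run(main()))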
@@ -159,12 +188,15 @@ def download_all_papers(progress=gr.Progress()):
         if data:
             all_data.update(data)
 
-    …
+    date_directory = create_date_directory()
+    results = asyncio.run(process_papers(all_data, date_directory, progress))
+
+    zip_file = zip_directory(date_directory)
+    download_link = get_base64_download_link(zip_file)
 
-    …
-        f.write(consolidated_text)
+    existing_links = get_existing_zip_links()
 
-    return "All papers have been downloaded and …
+    return f"All papers have been downloaded and saved in {zip_file}\n\n{results}", f"{download_link}<br><br>Previous downloads:<br>{existing_links}"
 
 with gr.Blocks() as demo:
     gr.Markdown("<h1><center>Papers Leaderboard</center></h1>")
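download_all_papers stays synchronous and drives the coroutine with asyncio.run, which creates a fresh event loop and blocks until every per-paper task finishes; it would raise RuntimeError if the calling thread already ran a loop. A reduced sketch of that call shape, with a stub in place of the real coroutine:

    import asyncio

    async def process_papers_stub(data, directory):
        # Stand-in for the process_papers coroutine in the diff above.
        await asyncio.sleep(0)
        return f"saved {len(data)} papers to {directory}"

    def download_all_papers_stub(data, directory):
        # asyncio.run starts its own event loop and blocks until done.
        return asyncio.run(process_papers_stub(data, directory))

    print(download_all_papers_stub({"Paper A": {}}, "2024-01-15"))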
@@ -189,8 +221,8 @@ with gr.Blocks() as demo:
 
     download_button = gr.Button("📚 Download All Papers", variant="primary")
     download_output = gr.Textbox(label="Download Status")
-    …
-    download_button.click(fn=download_all_papers, inputs=None, outputs=[download_output, …
+    download_links = gr.HTML(label="Download Links")
+    download_button.click(fn=download_all_papers, inputs=None, outputs=[download_output, download_links])
 
     # Load initial data for all tabs
     demo.load(fn=load_all_data, outputs=[top_count, top_html, new_count, new_html, greatest_count, greatest_html])
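The click handler now returns two values, mapped positionally onto outputs=[download_output, download_links]: the status string fills the Textbox and the link markup fills the new gr.HTML component. A minimal sketch of that two-output wiring, with a trivial handler in place of the real one:

    import gradio as gr

    def handler():
        # Return one value per component listed in outputs, in order.
        return "Done", "<a href='#'>download</a>"

    with gr.Blocks() as demo:
        btn = gr.Button("Run")
        status = gr.Textbox(label="Status")
        links = gr.HTML(label="Links")
        btn.click(fn=handler, inputs=None, outputs=[status, links])

    demo.launch()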
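A few follow-up notes on the new helpers. safe_filename keeps word characters, hyphens, underscores, dots, and spaces and replaces everything else with an underscore, so colons and slashes in a paper title can no longer break the file path:

    import re

    def safe_filename(title):
        """Convert a string to a safe filename."""
        return re.sub(r'[^\w\-_\. ]', '_', title)

    print(safe_filename("Attention Is All You Need?"))  # Attention Is All You Need_
    print(safe_filename("GAN: a/b study"))              # GAN_ a_b study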
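In zip_directory, the arcname passed to zipf.write is the file path made relative to the directory's parent, so entries keep the dated folder as their top-level prefix instead of being flattened to bare filenames:

    import os

    directory = "2024-01-15"                     # a dated directory, as in the diff
    path = os.path.join(directory, "Paper.pdf")  # hypothetical archive member
    arcname = os.path.relpath(path, os.path.join(directory, '..'))
    print(arcname)  # 2024-01-15/Paper.pdf: the folder survives inside the zip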
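get_base64_download_link inlines the whole zip into the returned HTML as a data: URI. Base64 costs roughly a third in overhead and the payload travels with the page, so this suits small archives; for large ones, serving the file through a component such as gr.File is a common alternative:

    import base64

    payload = b"x" * 1_000_000          # 1 MB of raw bytes
    encoded = base64.b64encode(payload)
    print(len(encoded) / len(payload))  # ~1.33: base64 adds about 33%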