Update app.py

app.py CHANGED
@@ -5,15 +5,11 @@ import gradio as gr
 import time
 import os
 import json
-import PyPDF2
-import io
 import asyncio
 import aiohttp
 import aiofiles
 import re
 from datetime import datetime
-import base64
-import zipfile
 
 # 🧙‍♂️ Magical Utility Functions 🧙‍♂️
 
@@ -21,19 +17,6 @@ def safe_filename(title):
     """Convert a string to a safe filename. No more 'file not found' nightmares! 🙅‍♂️📁"""
     return re.sub(r'[^\w\-_\. ]', '_', title)
 
-def create_timestamp_directory():
-    """Create a directory named with the current date and time. It's like a time capsule for your downloads! 🕰️📦"""
-    date_str = datetime.now().strftime("%m-%d-%Y-%H-%M")
-    os.makedirs(date_str, exist_ok=True)
-    return date_str
-
-def get_base64_download_link(file_path, filename):
-    """Create a base64 download link for a binary file. It's like teleportation for your files! 🚀📲"""
-    with open(file_path, "rb") as f:
-        content = f.read()
-    b64 = base64.b64encode(content).decode()
-    return f'<a href="data:application/zip;base64,{b64}" download="{filename}">Download {filename}</a>'
-
 # 🎬 Animated Banner Messages 🎬
 def animated_banner(message, emoji):
     """Create an animated banner message. It's like a tiny parade for your console! 🎉🎩"""
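For context, `safe_filename` (kept) and `get_base64_download_link` (deleted above) are both self-contained. A minimal sketch of what each does, with a made-up title and payload:

```python
import base64
import re

def safe_filename(title):
    # Keep word chars, '-', '_', '.', and spaces; everything else becomes '_'.
    return re.sub(r'[^\w\-_\. ]', '_', title)

print(safe_filename("Attention Is All You Need? (v2)"))
# -> Attention Is All You Need_ _v2_

# The deleted helper inlined the file bytes as a data: URI, so the browser
# could save the file without any server route existing for it.
payload = b"example bytes"
b64 = base64.b64encode(payload).decode()
print(f'<a href="data:application/zip;base64,{b64}" download="demo.zip">Download demo.zip</a>')
```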
@@ -170,73 +153,51 @@ def load_all_data():
     greatest_count, greatest_html = update_display("greatest")
     return top_count, top_html, new_count, new_html, greatest_count, greatest_html
 
-# 📚 Asynchronous PDF Downloading 📚
+# 📚 Asynchronous Web Page Downloading 📚
 
-async def download_pdf(session, title, paper_info, directory):
-    """Download …"""
-    pdf_url = paper_info[…]
-    if not pdf_url:
-        return f"🚫 No PDF link for: {title}. It's playing hide and seek! 🙈", None
+async def download_webpage(session, title, paper_info):
+    """Download the webpage content instead of the PDF. It's like browsing, but faster! 🌐🚀"""
+    link_url = paper_info['link']
+    if not link_url:
+        return f"🚫 No link for: {title}. It's playing hide and seek! 🙈", None, None
 
     try:
         timeout = aiohttp.ClientTimeout(total=60)  # 60 seconds timeout
-        async with session.get(pdf_url, timeout=timeout) as response:
+        async with session.get(link_url, timeout=timeout) as response:
             if response.status != 200:
-                return f"🚨 Failed to grab PDF for {title}: HTTP {response.status}. The internet gremlins strike again! 👹", None
-            pdf_content = await response.read()
-
-            file_length = len(pdf_content)
-            if file_length < 5000:  # Check if the PDF is less than 5KB
-                return f"📜 PDF for {title} is tiny ({file_length} bytes). It's like a paper for ants! 🐜📄", None
-
-            # …
-            safe_title = safe_filename(title)
-            pdf_filename = f"{safe_title}.pdf"
-            pdf_filepath = os.path.join(directory, pdf_filename)
-
-            async with aiofiles.open(pdf_filepath, 'wb') as f:
-                await f.write(pdf_content)
-
-            return f"📜 Successfully downloaded: {pdf_filename} (File length: {file_length} bytes).", pdf_filepath
+                return f"🚨 Failed to grab webpage for {title}: HTTP {response.status}. The internet gremlins strike again! 👹", None, None
+            page_content = await response.text()
+
+            # Combine the content as a Python type representation
+            code_block = f'"""\nTitle: {title}\nLink: {link_url}\n"""\n\n# Webpage Content\n{repr(page_content)}\n'
+
+            return f"📜 Successfully downloaded webpage for: {title}.", code_block, page_content
     except asyncio.TimeoutError:
-        return f"⏳ Timeout for {title}. The PDF is playing hard to get! 💃", None
+        return f"⏳ Timeout for {title}. The webpage is playing hard to get! 💃", None, None
     except Exception as e:
-        return f"💥 Oops! Error downloading {title}: {str(e)}. Gremlins in the system! 🛠️", None
+        return f"💥 Oops! Error downloading {title}: {str(e)}. Gremlins in the system! 🛠️", None, None
 
-async def process_papers(data, directory, progress=gr.Progress()):
-    """Process multiple papers asynchronously by downloading their PDFs. …"""
+async def process_webpages(data, progress=gr.Progress()):
+    """Process multiple papers asynchronously by downloading their webpages. 🤹‍♂️📊"""
     async with aiohttp.ClientSession() as session:
         tasks = []
         for title, paper_info in data.items():
-            task = asyncio.ensure_future(download_pdf(session, title, paper_info, directory))
+            task = asyncio.ensure_future(download_webpage(session, title, paper_info))
             tasks.append(task)
 
         results = []
-        pdf_files = []
+        codes = []
         for i, task in enumerate(asyncio.as_completed(tasks), start=1):
-            result, pdf_filepath = await task
+            result, code_block, page_content = await task
             results.append(result)
-            if pdf_filepath:
-                pdf_files.append(pdf_filepath)
+            if code_block:
+                codes.append(code_block)
             progress(i / len(tasks), f"📊 Processed {i}/{len(tasks)} papers. Downloading...")
 
-        return results, pdf_files
-
-def zip_files(directory):
-    """Zip all files in the given directory."""
-    zip_filename = f"{directory}.zip"
-    zip_filepath = os.path.join(directory, zip_filename)
-
-    with zipfile.ZipFile(zip_filepath, 'w') as zipf:
-        for root, _, files in os.walk(directory):
-            for file in files:
-                file_path = os.path.join(root, file)
-                zipf.write(file_path, arcname=file)
-
-    return zip_filepath
+        return results, codes
 
-def download_all_papers(progress=gr.Progress()):
-    """Download and …"""
+def download_all_webpages(progress=gr.Progress()):
+    """Download and display all paper webpages. It's like hosting a web party, and everyone's invited! 🎉📚"""
     all_data = {}
     for category in ["top", "latest", "greatest"]:
         cache_file = f"{category}_papers_cache.json"
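The new downloader keeps the usual aiohttp fan-out/fan-in shape: one shared `ClientSession`, one task per paper, and `asyncio.as_completed` so progress can tick as each fetch lands. A stripped-down, runnable sketch of the same pattern (placeholder URLs, `print` standing in for `gr.Progress`):

```python
import asyncio
import aiohttp

async def fetch(session, url):
    """Return (url, body), or (url, None) on any failure."""
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=60)) as resp:
            if resp.status != 200:
                return url, None
            return url, await resp.text()
    except (asyncio.TimeoutError, aiohttp.ClientError):
        return url, None

async def fetch_all(urls):
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.ensure_future(fetch(session, u)) for u in urls]
        results = []
        # as_completed yields in completion order, not submission order,
        # which is why download_webpage bakes the title into each message.
        for i, task in enumerate(asyncio.as_completed(tasks), start=1):
            results.append(await task)
            print(f"{i}/{len(tasks)} done")
        return results

pages = asyncio.run(fetch_all(["https://example.com", "https://example.org"]))
```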
@@ -244,24 +205,14 @@ def download_all_papers(progress=gr.Progress()):
         if data:
             all_data.update(data)
 
-    # …
-    date_directory = create_timestamp_directory()
-
-    # Download papers
-    results, pdf_files = asyncio.run(process_papers(all_data, date_directory, progress))
-
-    # Zip the directory
-    zip_filepath = zip_files(date_directory)
-
-    # Create a download link
-    download_link = get_base64_download_link(zip_filepath, f"{date_directory}.zip")
+    # Download the webpage content
+    results, code_blocks = asyncio.run(process_webpages(all_data, progress))
 
     summary = f"📚 Papers processed: {len(all_data)} (We're basically librarians now!)\n"
-    summary += f"✅ Successfully downloaded: {len(pdf_files)} PDFs\n"
-    summary += f"❌ Errors: {len(results) - len(pdf_files)} (Even superheroes have off days)\n\n"
-    summary += download_link
+    summary += f"✅ Successfully downloaded: {len(code_blocks)} webpages\n"
+    summary += f"❌ Errors: {len(results) - len(code_blocks)} (Even superheroes have off days)\n\n"
 
-    return summary, "…
+    return summary, "\n\n".join(code_blocks)
 
 # 🎭 Gradio Interface: Where the Magic Happens 🎭
 
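`download_all_webpages` stays synchronous, so when Gradio invokes it in a worker thread, `asyncio.run` can create a fresh event loop there. And because `download_webpage` wraps each page in `repr()`, the joined result is a valid Python literal that a `gr.Code` viewer can highlight. A small illustration with made-up content (same f-string as in the diff):

```python
title = "Example Paper"
link_url = "https://example.com"
page_content = "<html>\n  <body>hi</body>\n</html>"

code_block = f'"""\nTitle: {title}\nLink: {link_url}\n"""\n\n# Webpage Content\n{repr(page_content)}\n'
print(code_block)
# """
# Title: Example Paper
# Link: https://example.com
# """
#
# # Webpage Content
# '<html>\n  <body>hi</body>\n</html>'
```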
@@ -286,11 +237,10 @@ with gr.Blocks() as demo:
         greatest_button = gr.Button("Refresh Leaderboard")
         greatest_button.click(fn=lambda: update_display("greatest"), inputs=None, outputs=[greatest_count, greatest_html])
 
-    download_button = gr.Button("📄 Download All Papers", variant="primary")
+    download_button = gr.Button("📄 Download All Paper Webpages", variant="primary")
     download_output = gr.Textbox(label="Download Status")
-    download_links = …
-    text_output = …
-    download_button.click(fn=download_all_papers, inputs=None, outputs=[download_output, download_links, text_output])
+    code_output = gr.Code(label="Paper Webpage Contents", language="python")
+    download_button.click(fn=download_all_webpages, inputs=None, outputs=[download_output, code_output])
 
     # Load initial data for all tabs
     demo.load(fn=load_all_data, outputs=[top_count, top_html, new_count, new_html, greatest_count, greatest_html])
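The wiring in the last hunk follows the standard Blocks contract: the click handler returns one value per component listed in `outputs`. A minimal sketch with a stubbed handler (component names mirror the diff; the rest of the app's layout is omitted):

```python
import gradio as gr

def fake_download():
    summary = "📚 Papers processed: 2\n✅ Successfully downloaded: 2 webpages\n"
    blocks = "'<html>...</html>'"
    return summary, blocks  # -> (download_output, code_output)

with gr.Blocks() as demo:
    download_button = gr.Button("📄 Download All Paper Webpages", variant="primary")
    download_output = gr.Textbox(label="Download Status")
    code_output = gr.Code(label="Paper Webpage Contents", language="python")
    download_button.click(fn=fake_download, inputs=None, outputs=[download_output, code_output])

demo.launch()
```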