awacke1 committed on
Commit
60d9046
·
verified ·
1 Parent(s): 7b8a04f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -83
app.py CHANGED
@@ -5,15 +5,10 @@ import gradio as gr
5
  import time
6
  import os
7
  import json
8
- import PyPDF2
9
- import io
10
- import asyncio
11
  import aiohttp
12
  import aiofiles
13
  import re
14
  from datetime import datetime
15
- import base64
16
- import zipfile
17
 
18
  # πŸ§™β€β™‚οΈ Magical Utility Functions πŸ§™β€β™‚οΈ
19
 
@@ -21,19 +16,6 @@ def safe_filename(title):
21
  """Convert a string to a safe filename. No more 'file not found' nightmares! πŸ™…β€β™‚οΈπŸ“"""
22
  return re.sub(r'[^\w\-_\. ]', '_', title)
23
 
24
- def create_timestamp_directory():
25
- """Create a directory named with the current date and time. It's like a time capsule for your downloads! πŸ—“οΈπŸ“¦"""
26
- date_str = datetime.now().strftime("%m-%d-%Y-%H-%M")
27
- os.makedirs(date_str, exist_ok=True)
28
- return date_str
29
-
30
- def get_base64_download_link(file_path, filename):
31
- """Create a base64 download link for a binary file. It's like teleportation for your files! πŸŒŸπŸ“²"""
32
- with open(file_path, "rb") as f:
33
- content = f.read()
34
- b64 = base64.b64encode(content).decode()
35
- return f'<a href="data:application/zip;base64,{b64}" download="{filename}">Download {filename}</a>'
36
-
37
  # 🎬 Animated Banner Messages 🎬
38
  def animated_banner(message, emoji):
39
  """Create an animated banner message. It's like a tiny parade for your console! πŸŽ‰πŸš©"""
@@ -170,73 +152,51 @@ def load_all_data():
170
  greatest_count, greatest_html = update_display("greatest")
171
  return top_count, top_html, new_count, new_html, greatest_count, greatest_html
172
 
173
- # πŸš€ Asynchronous Paper Downloading and Zipping πŸš€
174
 
175
- async def download_pdf(session, title, paper_info, directory):
176
- """Download a PDF file. It's like collecting treasures from the internet! πŸ’ŽπŸ“„"""
177
- pdf_url = paper_info['pdf_link']
178
- if not pdf_url:
179
- return f"🚫 No PDF link for: {title}. It's playing hide and seek! πŸ™ˆ", None
180
 
181
  try:
182
  timeout = aiohttp.ClientTimeout(total=60) # 60 seconds timeout
183
- async with session.get(pdf_url, timeout=timeout) as response:
184
  if response.status != 200:
185
- return f"🚨 Failed to grab PDF for {title}: HTTP {response.status}. The internet gremlins strike again! πŸ‘Ή", None
186
- pdf_content = await response.read()
187
-
188
- file_length = len(pdf_content)
189
- if file_length < 5000: # Check if the PDF is less than 5KB
190
- return f"🐜 PDF for {title} is tiny ({file_length} bytes). It's like a paper for ants! πŸœπŸ“„", None
191
 
192
- # Save the PDF to the directory
193
- safe_title = safe_filename(title)
194
- pdf_filename = f"{safe_title}.pdf"
195
- pdf_filepath = os.path.join(directory, pdf_filename)
196
 
197
- async with aiofiles.open(pdf_filepath, 'wb') as f:
198
- await f.write(pdf_content)
199
-
200
- return f"πŸŽ‰ Successfully downloaded: {pdf_filename} (File length: {file_length} bytes).", pdf_filepath
201
  except asyncio.TimeoutError:
202
- return f"⏳ Timeout for {title}. The PDF is playing hard to get! πŸ’ƒ", None
203
  except Exception as e:
204
- return f"πŸ’₯ Oops! Error downloading {title}: {str(e)}. Gremlins in the system! πŸ› οΈ", None
205
 
206
- async def process_papers(data, directory, progress=gr.Progress()):
207
- """Process multiple papers asynchronously by downloading their PDFs. πŸ€Ήβ€β™‚οΈπŸ“š"""
208
  async with aiohttp.ClientSession() as session:
209
  tasks = []
210
  for title, paper_info in data.items():
211
- task = asyncio.ensure_future(download_pdf(session, title, paper_info, directory))
212
  tasks.append(task)
213
 
214
  results = []
215
- pdf_files = []
216
  for i, task in enumerate(asyncio.as_completed(tasks), start=1):
217
- result, pdf_filepath = await task
218
  results.append(result)
219
- if pdf_filepath:
220
- pdf_files.append(pdf_filepath)
221
  progress(i / len(tasks), f"πŸš€ Processed {i}/{len(tasks)} papers. Downloading...")
222
 
223
- return results, pdf_files
224
-
225
- def zip_files(directory):
226
- """Zip all files in the given directory."""
227
- zip_filename = f"{directory}.zip"
228
- zip_filepath = os.path.join(directory, zip_filename)
229
-
230
- with zipfile.ZipFile(zip_filepath, 'w') as zipf:
231
- for root, _, files in os.walk(directory):
232
- for file in files:
233
- file_path = os.path.join(root, file)
234
- zipf.write(file_path, arcname=file)
235
-
236
- return zip_filepath
237
 
238
- def download_all_papers(progress=gr.Progress()):
239
- """Download and zip all papers. It's like hosting a paper party, and everyone's invited! πŸŽ‰πŸ“š"""
240
  all_data = {}
241
  for category in ["top", "latest", "greatest"]:
242
  cache_file = f"{category}_papers_cache.json"
@@ -244,24 +204,14 @@ def download_all_papers(progress=gr.Progress()):
244
  if data:
245
  all_data.update(data)
246
 
247
- # Create timestamped directory
248
- date_directory = create_timestamp_directory()
249
-
250
- # Download papers
251
- results, pdf_files = asyncio.run(process_papers(all_data, date_directory, progress))
252
-
253
- # Zip the directory
254
- zip_filepath = zip_files(date_directory)
255
-
256
- # Create a download link
257
- download_link = get_base64_download_link(zip_filepath, f"{date_directory}.zip")
258
 
259
  summary = f"πŸ“Š Papers processed: {len(all_data)} (We're basically librarians now!)\n"
260
- summary += f"βœ… Successfully downloaded and zipped: {len(pdf_files)} (Take that, PDF gremlins!)\n"
261
- summary += f"❌ Errors: {len(results) - len(pdf_files)} (Even superheroes have off days)\n\n"
262
- summary += download_link
263
 
264
- return summary, "<br>".join(results)
265
 
266
  # 🎭 Gradio Interface: Where the Magic Happens 🎭
267
 
@@ -286,11 +236,10 @@ with gr.Blocks() as demo:
286
  greatest_button = gr.Button("Refresh Leaderboard")
287
  greatest_button.click(fn=lambda: update_display("greatest"), inputs=None, outputs=[greatest_count, greatest_html])
288
 
289
- download_button = gr.Button("πŸ“š Download All Papers", variant="primary")
290
  download_output = gr.Textbox(label="Download Status")
291
- download_links = gr.HTML(label="Download Links")
292
- text_output = gr.Code(label="Paper Contents", language="python")
293
- download_button.click(fn=download_all_papers, inputs=None, outputs=[download_output, download_links, text_output])
294
 
295
  # Load initial data for all tabs
296
  demo.load(fn=load_all_data, outputs=[top_count, top_html, new_count, new_html, greatest_count, greatest_html])
 
5
  import time
6
  import os
7
  import json
 
 
 
8
  import aiohttp
9
  import aiofiles
10
  import re
11
  from datetime import datetime
 
 
12
 
13
  # 🧙‍♂️ Magical Utility Functions 🧙‍♂️
14
 
 
16
  """Convert a string to a safe filename. No more 'file not found' nightmares! πŸ™…β€β™‚οΈπŸ“"""
17
  return re.sub(r'[^\w\-_\. ]', '_', title)
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  # 🎬 Animated Banner Messages 🎬
20
  def animated_banner(message, emoji):
21
  """Create an animated banner message. It's like a tiny parade for your console! πŸŽ‰πŸš©"""
 
152
  greatest_count, greatest_html = update_display("greatest")
153
  return top_count, top_html, new_count, new_html, greatest_count, greatest_html
154
 
155
+ # 🚀 Asynchronous Web Page Downloading 🚀
156
 
157
async def download_webpage(session, title, paper_info):
    """Download the web page of one paper instead of its PDF. 🌐📄

    Args:
        session: Open HTTP client session; only ``session.get(url, timeout=...)``
            used as an async context manager is required.
        title: Paper title, used in the status message and the code-block header.
        paper_info: Mapping describing the paper; its ``'link'`` entry is the
            URL that gets fetched.

    Returns:
        A ``(status_message, code_block, page_content)`` tuple. ``code_block``
        is a Python-looking text block (header docstring plus the ``repr`` of
        the page text). On any failure the last two items are ``None`` and the
        status message explains what went wrong.
    """
    # asyncio is not among the module-level imports visible for this revision,
    # but the timeout handler below needs it; import locally so the handler
    # cannot fail with NameError.
    import asyncio

    # .get() instead of ['link']: a record without the key is reported as a
    # normal "no link" result rather than raising KeyError out of the task.
    link_url = paper_info.get('link')
    if not link_url:
        return f"🚫 No link for: {title}. It's playing hide and seek! 🙈", None, None

    try:
        timeout = aiohttp.ClientTimeout(total=60)  # 60 seconds timeout
        async with session.get(link_url, timeout=timeout) as response:
            if response.status != 200:
                return f"🚨 Failed to grab webpage for {title}: HTTP {response.status}. The internet gremlins strike again! 👹", None, None
            page_content = await response.text()

            # Combine the content as a Python type representation
            code_block = f'"""\nTitle: {title}\nLink: {link_url}\n"""\n\n# Webpage Content\n{repr(page_content)}\n'

            return f"🎉 Successfully downloaded webpage for: {title}.", code_block, page_content
    except asyncio.TimeoutError:
        return f"⏳ Timeout for {title}. The webpage is playing hard to get! 💃", None, None
    except Exception as e:
        # Deliberate best-effort boundary: one bad page must not abort the batch.
        return f"💥 Oops! Error downloading {title}: {str(e)}. Gremlins in the system! 🛠️", None, None
178
 
179
async def process_webpages(data, progress=gr.Progress()):
    """Process multiple papers asynchronously by downloading their webpages. 🤹‍♂️🌐

    Args:
        data: Mapping of paper title -> paper info dict, passed item by item
            to ``download_webpage``.
        progress: Callable invoked after each finished download with the
            completed fraction and a status text.

    Returns:
        ``(results, codes)``: every status message in completion order, and
        the code blocks of the downloads that produced one.
    """
    # asyncio is not among the module-level imports visible for this revision;
    # import it here so scheduling the downloads cannot fail with NameError.
    import asyncio

    results = []
    codes = []
    async with aiohttp.ClientSession() as session:
        # Schedule every download up front so they run concurrently.
        tasks = [
            asyncio.ensure_future(download_webpage(session, title, paper_info))
            for title, paper_info in data.items()
        ]
        total = len(tasks)

        # as_completed yields in finish order, so results are not in input order.
        for i, task in enumerate(asyncio.as_completed(tasks), start=1):
            result, code_block, _page_content = await task  # raw page text is not used here
            results.append(result)
            if code_block:
                codes.append(code_block)
            progress(i / total, f"🚀 Processed {i}/{total} papers. Downloading...")

    return results, codes
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
+ def download_all_webpages(progress=gr.Progress()):
199
+ """Download and display all paper webpages. It's like hosting a web party, and everyone's invited! πŸŽ‰πŸŒ"""
200
  all_data = {}
201
  for category in ["top", "latest", "greatest"]:
202
  cache_file = f"{category}_papers_cache.json"
 
204
  if data:
205
  all_data.update(data)
206
 
207
+ # Download the webpage content
208
+ results, code_blocks = asyncio.run(process_webpages(all_data, progress))
 
 
 
 
 
 
 
 
 
209
 
210
  summary = f"πŸ“Š Papers processed: {len(all_data)} (We're basically librarians now!)\n"
211
+ summary += f"βœ… Successfully downloaded: {len(code_blocks)} webpages\n"
212
+ summary += f"❌ Errors: {len(results) - len(code_blocks)} (Even superheroes have off days)\n\n"
 
213
 
214
+ return summary, "\n\n".join(code_blocks)
215
 
216
  # 🎭 Gradio Interface: Where the Magic Happens 🎭
217
 
 
236
  greatest_button = gr.Button("Refresh Leaderboard")
237
  greatest_button.click(fn=lambda: update_display("greatest"), inputs=None, outputs=[greatest_count, greatest_html])
238
 
239
+ download_button = gr.Button("πŸ“š Download All Paper Webpages", variant="primary")
240
  download_output = gr.Textbox(label="Download Status")
241
+ code_output = gr.Code(label="Paper Webpage Contents", language="python")
242
+ download_button.click(fn=download_all_webpages, inputs=None, outputs=[download_output, code_output])
 
243
 
244
  # Load initial data for all tabs
245
  demo.load(fn=load_all_data, outputs=[top_count, top_html, new_count, new_html, greatest_count, greatest_html])