Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -122,14 +122,17 @@ def create_date_directory():
|
|
122 |
async def download_and_save_pdf(session, title, paper_info, directory):
|
123 |
pdf_url = paper_info['pdf_link']
|
124 |
if not pdf_url:
|
125 |
-
return f"No PDF link available for: {title}"
|
126 |
|
127 |
try:
|
128 |
-
|
|
|
|
|
|
|
129 |
pdf_content = await response.read()
|
130 |
|
131 |
if len(pdf_content) < 2048: # Check if the PDF is less than 2KB
|
132 |
-
return f"Downloaded PDF for {title} is too small (less than 2KB). Skipping."
|
133 |
|
134 |
safe_title = safe_filename(title)
|
135 |
filename = f"{safe_title}.pdf"
|
@@ -138,9 +141,11 @@ async def download_and_save_pdf(session, title, paper_info, directory):
|
|
138 |
async with aiofiles.open(filepath, 'wb') as f:
|
139 |
await f.write(pdf_content)
|
140 |
|
141 |
-
return f"Successfully saved: {filename}"
|
|
|
|
|
142 |
except Exception as e:
|
143 |
-
return f"Error saving PDF for {title}: {str(e)}"
|
144 |
|
145 |
async def process_papers(data, directory, progress=gr.Progress()):
|
146 |
async with aiohttp.ClientSession() as session:
|
@@ -150,24 +155,25 @@ async def process_papers(data, directory, progress=gr.Progress()):
|
|
150 |
tasks.append(task)
|
151 |
|
152 |
results = []
|
|
|
|
|
153 |
for i, task in enumerate(asyncio.as_completed(tasks), start=1):
|
154 |
-
result = await task
|
155 |
results.append(result)
|
|
|
|
|
|
|
|
|
156 |
progress(i / len(tasks), f"Processed {i}/{len(tasks)} papers")
|
157 |
|
158 |
-
return
|
159 |
|
160 |
-
def zip_directory(directory):
|
161 |
-
"""Zip the
|
162 |
zip_filename = f"{directory}.zip"
|
163 |
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
|
164 |
-
for
|
165 |
-
|
166 |
-
file_path = os.path.join(root, file)
|
167 |
-
if os.path.getsize(file_path) >= 2048: # Only include files 2KB or larger
|
168 |
-
zipf.write(file_path,
|
169 |
-
os.path.relpath(file_path,
|
170 |
-
os.path.join(directory, '..')))
|
171 |
return zip_filename
|
172 |
|
173 |
def get_base64_download_link(file_path):
|
@@ -194,17 +200,22 @@ def download_all_papers(progress=gr.Progress()):
|
|
194 |
all_data.update(data)
|
195 |
|
196 |
date_directory = create_date_directory()
|
197 |
-
results = asyncio.run(process_papers(all_data, date_directory, progress))
|
198 |
|
199 |
-
|
200 |
-
|
|
|
|
|
|
|
201 |
|
202 |
existing_links = get_existing_zip_links()
|
203 |
|
204 |
-
|
205 |
-
|
|
|
|
|
206 |
|
207 |
-
return
|
208 |
|
209 |
with gr.Blocks() as demo:
|
210 |
gr.Markdown("<h1><center>Papers Leaderboard</center></h1>")
|
|
|
122 |
async def download_and_save_pdf(session, title, paper_info, directory):
|
123 |
pdf_url = paper_info['pdf_link']
|
124 |
if not pdf_url:
|
125 |
+
return f"No PDF link available for: {title}", None
|
126 |
|
127 |
try:
|
128 |
+
timeout = aiohttp.ClientTimeout(total=60) # 60 seconds timeout
|
129 |
+
async with session.get(pdf_url, timeout=timeout) as response:
|
130 |
+
if response.status != 200:
|
131 |
+
return f"Failed to download PDF for {title}: HTTP {response.status}", None
|
132 |
pdf_content = await response.read()
|
133 |
|
134 |
if len(pdf_content) < 2048: # Check if the PDF is less than 2KB
|
135 |
+
return f"Downloaded PDF for {title} is too small (less than 2KB). Skipping.", None
|
136 |
|
137 |
safe_title = safe_filename(title)
|
138 |
filename = f"{safe_title}.pdf"
|
|
|
141 |
async with aiofiles.open(filepath, 'wb') as f:
|
142 |
await f.write(pdf_content)
|
143 |
|
144 |
+
return f"Successfully saved: {filename}", filepath
|
145 |
+
except asyncio.TimeoutError:
|
146 |
+
return f"Timeout while downloading PDF for {title}", None
|
147 |
except Exception as e:
|
148 |
+
return f"Error saving PDF for {title}: {str(e)}", None
|
149 |
|
150 |
async def process_papers(data, directory, progress=gr.Progress()):
|
151 |
async with aiohttp.ClientSession() as session:
|
|
|
155 |
tasks.append(task)
|
156 |
|
157 |
results = []
|
158 |
+
successful_downloads = []
|
159 |
+
errors = []
|
160 |
for i, task in enumerate(asyncio.as_completed(tasks), start=1):
|
161 |
+
result, filepath = await task
|
162 |
results.append(result)
|
163 |
+
if filepath:
|
164 |
+
successful_downloads.append(filepath)
|
165 |
+
else:
|
166 |
+
errors.append(result)
|
167 |
progress(i / len(tasks), f"Processed {i}/{len(tasks)} papers")
|
168 |
|
169 |
+
return results, successful_downloads, errors
|
170 |
|
171 |
+
def zip_directory(files_to_zip, directory):
    """Zip the specified files into ``<directory>.zip``.

    Args:
        files_to_zip: Iterable of file paths to add to the archive. A path
            listed more than once is written only once.
        directory: Directory the archive is named after. Member names are
            stored relative to this directory's parent.

    Returns:
        The path of the zip file that was written.
    """
    # Drop trailing separators so "out/2024-01-01/" produces
    # "out/2024-01-01.zip" rather than a hidden "out/2024-01-01/.zip".
    separators = os.sep + (os.altsep or "")
    directory = directory.rstrip(separators) or directory

    zip_filename = f"{directory}.zip"
    # Loop-invariant: every member name is computed relative to the parent.
    archive_root = os.path.join(directory, os.pardir)

    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # dict.fromkeys de-duplicates while preserving order, so the same
        # path is never written twice as a duplicate archive member.
        for file in dict.fromkeys(files_to_zip):
            zipf.write(file, os.path.relpath(file, archive_root))
    return zip_filename
|
178 |
|
179 |
def get_base64_download_link(file_path):
|
|
|
200 |
all_data.update(data)
|
201 |
|
202 |
date_directory = create_date_directory()
|
203 |
+
results, successful_downloads, errors = asyncio.run(process_papers(all_data, date_directory, progress))
|
204 |
|
205 |
+
if successful_downloads:
|
206 |
+
zip_file = zip_directory(successful_downloads, date_directory)
|
207 |
+
download_link = get_base64_download_link(zip_file)
|
208 |
+
else:
|
209 |
+
download_link = "No papers were successfully downloaded."
|
210 |
|
211 |
existing_links = get_existing_zip_links()
|
212 |
|
213 |
+
summary = f"Papers processed: {len(all_data)}\n"
|
214 |
+
summary += f"Successfully downloaded: {len(successful_downloads)}\n"
|
215 |
+
summary += f"Errors: {len(errors)}\n\n"
|
216 |
+
summary += "Error List:\n" + "\n".join(errors) if errors else "No errors occurred."
|
217 |
|
218 |
+
return summary, f"{download_link}<br><br>Previous downloads:<br>{existing_links}"
|
219 |
|
220 |
with gr.Blocks() as demo:
|
221 |
gr.Markdown("<h1><center>Papers Leaderboard</center></h1>")
|