awacke1 committed
Commit 09acd5a · verified · 1 Parent(s): 0058d22

Update app.py

Files changed (1):
  1. app.py +64 -32
app.py CHANGED

@@ -11,6 +11,10 @@ import asyncio
 import aiohttp
 import aiofiles
 from concurrent.futures import ThreadPoolExecutor
+import re
+from datetime import datetime
+import zipfile
+import base64
 
 def get_rank_papers(url, progress=gr.Progress(track_tqdm=True)):
     base_url = "https://paperswithcode.com"
@@ -105,51 +109,76 @@ def load_all_data():
     greatest_count, greatest_html = update_display("greatest")
     return top_count, top_html, new_count, new_html, greatest_count, greatest_html
 
-async def download_and_convert_pdf(session, title, paper_info):
+def safe_filename(title):
+    """Convert a string to a safe filename."""
+    return re.sub(r'[^\w\-_\. ]', '_', title)
+
+def create_date_directory():
+    """Create a directory named with the current date."""
+    date_str = datetime.now().strftime("%Y-%m-%d")
+    os.makedirs(date_str, exist_ok=True)
+    return date_str
+
+async def download_and_save_pdf(session, title, paper_info, directory):
     pdf_url = paper_info['pdf_link']
-    cache_file = f"cache/{title.replace(' ', '_')}.md"
-
-    if os.path.exists(cache_file):
-        async with aiofiles.open(cache_file, 'r') as f:
-            return await f.read()
-
     if not pdf_url:
-        return f"# {title}\n\nNo PDF link available.\n\n---\n\n"
+        return f"No PDF link available for: {title}"
 
     try:
         async with session.get(pdf_url) as response:
             pdf_content = await response.read()
 
-            pdf_file = io.BytesIO(pdf_content)
-            pdf_reader = PyPDF2.PdfReader(pdf_file)
-            text = ""
-            for page in pdf_reader.pages:
-                text += page.extract_text()
+            safe_title = safe_filename(title)
+            filename = f"{safe_title}.pdf"
+            filepath = os.path.join(directory, filename)
 
-            markdown_text = f"# {title}\n\n{text}\n\n---\n\n"
+            async with aiofiles.open(filepath, 'wb') as f:
+                await f.write(pdf_content)
 
-            os.makedirs('cache', exist_ok=True)
-            async with aiofiles.open(cache_file, 'w') as f:
-                await f.write(markdown_text)
-
-            return markdown_text
+            return f"Successfully saved: {filename}"
     except Exception as e:
-        return f"# {title}\n\nError processing PDF: {str(e)}\n\n---\n\n"
+        return f"Error saving PDF for {title}: {str(e)}"
 
-async def process_papers(data, progress=gr.Progress()):
+async def process_papers(data, directory, progress=gr.Progress()):
     async with aiohttp.ClientSession() as session:
         tasks = []
         for title, paper_info in data.items():
-            task = asyncio.ensure_future(download_and_convert_pdf(session, title, paper_info))
+            task = asyncio.ensure_future(download_and_save_pdf(session, title, paper_info, directory))
             tasks.append(task)
 
-        consolidated_text = ""
+        results = []
         for i, task in enumerate(asyncio.as_completed(tasks), start=1):
-            markdown_text = await task
-            consolidated_text += markdown_text
+            result = await task
+            results.append(result)
             progress(i / len(tasks), f"Processed {i}/{len(tasks)} papers")
 
-    return consolidated_text
+    return "\n".join(results)
+
+def zip_directory(directory):
+    """Zip the entire directory."""
+    zip_filename = f"{directory}.zip"
+    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
+        for root, _, files in os.walk(directory):
+            for file in files:
+                zipf.write(os.path.join(root, file),
+                           os.path.relpath(os.path.join(root, file),
+                                           os.path.join(directory, '..')))
+    return zip_filename
+
+def get_base64_download_link(file_path):
+    """Create a base64 download link for a file."""
+    with open(file_path, "rb") as file:
+        content = file.read()
+    b64 = base64.b64encode(content).decode()
+    return f'<a href="data:application/zip;base64,{b64}" download="{os.path.basename(file_path)}">Download {os.path.basename(file_path)}</a>'
+
+def get_existing_zip_links():
+    """Get download links for existing zip files."""
+    links = []
+    for file in os.listdir('.'):
+        if file.endswith('.zip') and os.path.isfile(file):
+            links.append(get_base64_download_link(file))
+    return "<br>".join(links)
 
 def download_all_papers(progress=gr.Progress()):
     all_data = {}
@@ -159,12 +188,15 @@ def download_all_papers(progress=gr.Progress()):
     if data:
         all_data.update(data)
 
-    consolidated_text = asyncio.run(process_papers(all_data, progress))
+    date_directory = create_date_directory()
+    results = asyncio.run(process_papers(all_data, date_directory, progress))
+
+    zip_file = zip_directory(date_directory)
+    download_link = get_base64_download_link(zip_file)
 
-    with open("consolidated_papers.md", "w", encoding="utf-8") as f:
-        f.write(consolidated_text)
+    existing_links = get_existing_zip_links()
 
-    return "All papers have been downloaded and consolidated into 'consolidated_papers.md'", consolidated_text
+    return f"All papers have been downloaded and saved in {zip_file}\n\n{results}", f"{download_link}<br><br>Previous downloads:<br>{existing_links}"
 
 with gr.Blocks() as demo:
     gr.Markdown("<h1><center>Papers Leaderboard</center></h1>")
@@ -189,8 +221,8 @@ with gr.Blocks() as demo:
 
     download_button = gr.Button("📚 Download All Papers", variant="primary")
     download_output = gr.Textbox(label="Download Status")
-    markdown_output = gr.Markdown(label="Paper Content")
-    download_button.click(fn=download_all_papers, inputs=None, outputs=[download_output, markdown_output])
+    download_links = gr.HTML(label="Download Links")
+    download_button.click(fn=download_all_papers, inputs=None, outputs=[download_output, download_links])
 
     # Load initial data for all tabs
     demo.load(fn=load_all_data, outputs=[top_count, top_html, new_count, new_html, greatest_count, greatest_html])
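
A few notes on the patterns this commit introduces. The new safe_filename helper replaces every character outside [\w\-_\. ] with an underscore, so paper titles containing colons, slashes, or question marks become valid filenames. A quick sketch of its behavior (the example title is invented):

import re

def safe_filename(title):
    """Replace filesystem-unsafe characters with underscores."""
    return re.sub(r'[^\w\-_\. ]', '_', title)

print(safe_filename("Attention: Is All You Need?"))
# -> 'Attention_ Is All You Need_'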
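
The download path follows the usual aiohttp fan-out pattern: one shared ClientSession, one task per paper scheduled with asyncio.ensure_future, and asyncio.as_completed to consume results in completion order, so the progress bar advances as soon as any download finishes rather than in submission order. A minimal self-contained sketch of that pattern, assuming placeholder URLs and filenames rather than the app's scraped links:

import asyncio
import aiohttp
import aiofiles

async def fetch_to_file(session, url, path):
    # Read the whole body, then write it to disk without blocking the event loop.
    async with session.get(url) as resp:
        data = await resp.read()
    async with aiofiles.open(path, 'wb') as f:
        await f.write(data)
    return path

async def fetch_all(urls):
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.ensure_future(fetch_to_file(session, url, f"paper_{i}.pdf"))
                 for i, url in enumerate(urls)]
        saved = []
        # as_completed yields tasks in finish order, not submission order.
        for i, task in enumerate(asyncio.as_completed(tasks), start=1):
            saved.append(await task)
            print(f"Processed {i}/{len(tasks)}")
    return saved

# asyncio.run(fetch_all(["https://example.com/a.pdf"]))  # placeholder URL

Note that download_all_papers drives this with asyncio.run() from a synchronous Gradio callback; that works as long as the handler's thread has no running event loop, which is normally the case for Gradio's synchronous handlers.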
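
Serving the archive as a data: URI embeds the whole zip, base64-encoded (roughly a 4/3 size blow-up), inside the returned HTML. That avoids configuring static file routes, but the link is rebuilt on every click, so it suits small archives best. A condensed sketch of the zip-and-link step, equivalent to zip_directory plus get_base64_download_link:

import base64
import os
import zipfile

def zip_and_link(directory):
    """Zip a directory tree and return an HTML data-URI download link."""
    zip_name = f"{directory}.zip"
    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(directory):
            for name in files:
                full = os.path.join(root, name)
                # Archive paths are kept relative to the directory's parent.
                zf.write(full, os.path.relpath(full, os.path.join(directory, '..')))
    with open(zip_name, 'rb') as f:
        b64 = base64.b64encode(f.read()).decode()
    return (f'<a href="data:application/zip;base64,{b64}" '
            f'download="{zip_name}">Download {zip_name}</a>')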
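
On the UI side, the click handler now returns a (status, html) pair, so the button is wired to two outputs: a Textbox for the status string and an HTML component that renders the anchor tags. A stripped-down sketch of that wiring, with the handler body stubbed out:

import gradio as gr

def download_all_papers():
    # Stub standing in for the scrape/download/zip pipeline above.
    status = "All papers have been downloaded."
    links_html = '<a href="#">Download example.zip</a>'  # placeholder link
    return status, links_html

with gr.Blocks() as demo:
    download_button = gr.Button("📚 Download All Papers", variant="primary")
    download_output = gr.Textbox(label="Download Status")
    download_links = gr.HTML(label="Download Links")
    # The two returned values map positionally onto the two outputs.
    download_button.click(fn=download_all_papers, inputs=None,
                          outputs=[download_output, download_links])

demo.launch()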