awacke1 committed on
Commit
6963a95
·
verified ·
1 Parent(s): a1a6f6f

Update app.py

Files changed (1)
  1. app.py +163 -24
app.py CHANGED
@@ -10,28 +10,164 @@ import io
10
  import asyncio
11
  import aiohttp
12
  import aiofiles
13
- from concurrent.futures import ThreadPoolExecutor
14
  import re
15
  from datetime import datetime
16
  import base64
17
 
18
- # ... (keep all the previous functions up to create_date_directory)
19
 
20
  async def download_and_process_pdf(session, title, paper_info, directory):
 
21
  pdf_url = paper_info['pdf_link']
22
  if not pdf_url:
23
- return f"No PDF link available for: {title}", None, None
24
 
25
  try:
26
  timeout = aiohttp.ClientTimeout(total=60) # 60 seconds timeout
27
  async with session.get(pdf_url, timeout=timeout) as response:
28
  if response.status != 200:
29
- return f"Failed to download PDF for {title}: HTTP {response.status}", None, None
30
  pdf_content = await response.read()
31
 
32
  file_length = len(pdf_content)
33
  if file_length < 5000: # Check if the PDF is less than 5KB
34
- return f"Downloaded PDF for {title} is too small ({file_length} bytes). Skipping.", None, None
35
 
36
  # Convert PDF to text
37
  pdf_file = io.BytesIO(pdf_content)
@@ -41,7 +177,7 @@ async def download_and_process_pdf(session, title, paper_info, directory):
41
  text += page.extract_text()
42
 
43
  if len(text) < 5000: # Check if the extracted text is less than 5KB
44
- return f"Extracted text for {title} is too small ({len(text)} characters). Skipping.", None, None
45
 
46
  safe_title = safe_filename(title)
47
  txt_filename = f"{safe_title}.txt"
@@ -50,13 +186,14 @@ async def download_and_process_pdf(session, title, paper_info, directory):
50
  async with aiofiles.open(txt_filepath, 'w', encoding='utf-8') as f:
51
  await f.write(text)
52
 
53
- return f"Successfully processed: {txt_filename} (File length: {file_length} bytes)", txt_filepath, text
54
  except asyncio.TimeoutError:
55
- return f"Timeout while downloading PDF for {title}", None, None
56
  except Exception as e:
57
- return f"Error processing PDF for {title}: {str(e)}", None, None
58
 
59
  async def process_papers(data, directory, progress=gr.Progress()):
 
60
  async with aiohttp.ClientSession() as session:
61
  tasks = []
62
  for title, paper_info in data.items():
@@ -73,16 +210,19 @@ async def process_papers(data, directory, progress=gr.Progress()):
73
  successful_downloads.append((filepath, text))
74
  else:
75
  errors.append(result)
76
- progress(i / len(tasks), f"Processed {i}/{len(tasks)} papers")
77
 
78
  return results, successful_downloads, errors
79
 
80
- def get_base64_download_link(content, filename):
81
- """Create a base64 download link for text content."""
82
- b64 = base64.b64encode(content.encode()).decode()
83
- return f'<a href="data:text/plain;base64,{b64}" download="{filename}">Download {filename}</a>'
84
-
85
  def download_all_papers(progress=gr.Progress()):
 
86
  all_data = {}
87
  for category in ["top", "latest", "greatest"]:
88
  cache_file = f"{category}_papers_cache.json"
@@ -93,20 +233,22 @@ def download_all_papers(progress=gr.Progress()):
93
  date_directory = create_date_directory()
94
  results, successful_downloads, errors = asyncio.run(process_papers(all_data, date_directory, progress))
95
 
96
- summary = f"Papers processed: {len(all_data)}\n"
97
- summary += f"Successfully downloaded and converted: {len(successful_downloads)}\n"
98
- summary += f"Errors: {len(errors)}\n\n"
99
- summary += "Error List:\n" + "\n".join(errors) if errors else "No errors occurred."
100
 
101
  download_links = []
102
  text_contents = []
103
  for filepath, text in successful_downloads:
104
  filename = os.path.basename(filepath)
105
  download_links.append(get_base64_download_link(text, filename))
106
- text_contents.append(f"--- {filename} ---\n\n{text[:1000]}...\n\n") # Show first 1000 characters
107
 
108
  return summary, "<br>".join(download_links), "\n".join(text_contents)
109

110
  with gr.Blocks() as demo:
111
  gr.Markdown("<h1><center>Papers Leaderboard</center></h1>")
112
 
@@ -135,7 +277,4 @@ with gr.Blocks() as demo:
135
  download_button.click(fn=download_all_papers, inputs=None, outputs=[download_output, download_links, text_output])
136
 
137
  # Load initial data for all tabs
138
- demo.load(fn=load_all_data, outputs=[top_count, top_html, new_count, new_html, greatest_count, greatest_html])
139
-
140
- # Launch the Gradio interface with a public link
141
- demo.launch(share=True)
 
10
  import asyncio
11
  import aiohttp
12
  import aiofiles
 
13
  import re
14
  from datetime import datetime
15
  import base64
16
 
17
+ # 🧙‍♂️ Magical Utility Functions 🧙‍♂️
18
+
19
+ def safe_filename(title):
20
+ """Convert a string to a safe filename. No more 'file not found' nightmares! ๐Ÿ™…โ€โ™‚๏ธ๐Ÿ“"""
21
+ return re.sub(r'[^\w\-_\. ]', '_', title)
22
+
23
+ def create_date_directory():
24
+ """Create a directory named with the current date. It's like a time capsule for your downloads! ๐Ÿ—“๏ธ๐Ÿ“ฆ"""
25
+ date_str = datetime.now().strftime("%Y-%m-%d")
26
+ os.makedirs(date_str, exist_ok=True)
27
+ return date_str
28
+
29
+ def get_base64_download_link(content, filename):
30
+ """Create a base64 download link for text content. It's like teleportation for your files! ๐ŸŒŸ๐Ÿ“ฒ"""
31
+ b64 = base64.b64encode(content.encode()).decode()
32
+ return f'<a href="data:text/plain;base64,{b64}" download="{filename}">Download {filename}</a>'
33
+
34
+ # 🎬 Animated Banner Messages 🎬
35
+ def animated_banner(message, emoji):
36
+ """Create an animated banner message. It's like a tiny parade for your console! 🎉🚩"""
37
+ frames = [
38
+ f"╔════ {emoji} ════╗\n║ {message:^16} ║\n╚════════════╝",
39
+ f"╔════ {emoji} ════╗\n║ {message:^16} ║\n╚════════════╝",
40
+ f"╔════{emoji}════╗\n║ {message:^14} ║\n╚══════════╝",
41
+ f"╔═══{emoji}═══╗\n║ {message:^12} ║\n╚════════╝",
42
+ f"╔══{emoji}══╗\n║ {message:^10} ║\n╚══════╝",
43
+ f"╔═{emoji}═╗\n║ {message:^8} ║\n╚════╝",
44
+ f"╔{emoji}╗\n║ {message:^6} ║\n╚══╝",
45
+ ]
46
+ return frames
47
+
48
+ # 🕵️‍♂️ Data Fetching and Caching Shenanigans 🕵️‍♂️
49
+
50
+ def get_rank_papers(url, progress=gr.Progress(track_tqdm=True)):
51
+ """Fetch papers from the interwebs. It's like fishing, but for knowledge! ๐ŸŽฃ๐Ÿ“š"""
52
+ base_url = "https://paperswithcode.com"
53
+ session = requests.Session()
54
+ headers = {
55
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
56
+ 'Cache-Control': 'no-cache'
57
+ }
58
+ print("Time run at : ", time.ctime())
59
+ offset = 0
60
+ data_list = {}
61
+ break_duplicate = 10
62
+
63
+ while True:
64
+ response = session.get(url, headers=headers, params={'page': offset})
65
+ if response.status_code != 200:
66
+ print('Failed to retrieve data')
67
+ break
68
+ soup = BeautifulSoup(response.text, 'html.parser')
69
+ paper_info = soup.find_all('div', class_='row infinite-item item paper-card')
70
+ if not paper_info:
71
+ break
72
+ for ppr in paper_info:
73
+ title = ppr.find('h1').text.strip()
74
+
75
+ if "paper" in ppr.find('a')['href']:
76
+ link = base_url + ppr.find('a')['href']
77
+ else:
78
+ link = ppr.find('a')['href']
79
+ Github_Star = ppr.find('span', class_='badge badge-secondary').text.strip().replace(',', '')
80
+ pdf_link = ''
81
+ try:
82
+ response_link = session.get(link, headers=headers)
83
+ soup_link = BeautifulSoup(response_link.text, 'html.parser')
84
+ paper_info_link = soup_link.find_all('div', class_='paper-abstract')
85
+ pdf_link = paper_info_link[0].find('div', class_='col-md-12').find('a')['href']
86
+ except:
87
+ pass
88
+ if title not in data_list:
89
+ data_list[title] = {'link': link, 'Github Star': int(Github_Star), 'pdf_link': pdf_link.strip()}
90
+ else:
91
+ break_duplicate -= 1
92
+ if break_duplicate == 0:
93
+ return data_list
94
+ offset += 1
95
+ progress.update(offset)
96
+ print('Data retrieval complete')
97
+ return data_list
98
+
99
+ def load_cached_data(cache_file):
100
+ """Load cached data. It's like finding money in your old jeans! ๐Ÿ’ฐ๐Ÿงต"""
101
+ if os.path.exists(cache_file):
102
+ with open(cache_file, 'r') as f:
103
+ return json.load(f)
104
+ return None
105
+
106
+ def save_cached_data(data, cache_file):
107
+ """Save data to cache. Future you will thank present you! ๐Ÿฆธโ€โ™‚๏ธ๐Ÿ•ฐ๏ธ"""
108
+ with open(cache_file, 'w') as f:
109
+ json.dump(data, f)
110
+
111
+ def load_and_cache_data(url, cache_file):
112
+ """Load data from cache or fetch new data. It's like a time machine for your data! โฐ๐Ÿ”„"""
113
+ cached_data = load_cached_data(cache_file)
114
+
115
+ if cached_data:
116
+ print(f"Loading cached data from {cache_file}")
117
+ return cached_data
118
+
119
+ print(f"Fetching new data from {url}")
120
+ new_data = get_rank_papers(url)
121
+ save_cached_data(new_data, cache_file)
122
+ return new_data
123
+
124
+ # 📊 Data Processing and Display Magic 📊
125
+
126
+ def format_dataframe(data):
127
+ """Format data into a pretty DataFrame. It's like giving your data a makeover! ๐Ÿ’…๐Ÿ“ˆ"""
128
+ df = pd.DataFrame(data).T
129
+ df['title'] = df.index
130
+ df = df[['title', 'Github Star', 'link', 'pdf_link']]
131
+ df = df.sort_values(by='Github Star', ascending=False)
132
+ df['link'] = df['link'].apply(lambda x: f'<a href="{x}" target="_blank">Link</a>')
133
+ df['pdf_link'] = df['pdf_link'].apply(lambda x: f'<a href="{x}" target="_blank">{x}</a>')
134
+ return df
135
+
136
+ def update_display(category):
137
+ """Update the display for a category. Freshen up your data like it's spring cleaning! ๐Ÿงน๐ŸŒธ"""
138
+ cache_file = f"{category}_papers_cache.json"
139
+ url = f"https://paperswithcode.com/{category}" if category != "top" else "https://paperswithcode.com/"
140
+
141
+ data = load_and_cache_data(url, cache_file)
142
+ df = format_dataframe(data)
143
+
144
+ return len(df), df.to_html(escape=False, index=False)
145
+
146
+ def load_all_data():
147
+ """Load data for all categories. It's like a buffet for your brain! ๐Ÿง ๐Ÿฝ๏ธ"""
148
+ top_count, top_html = update_display("top")
149
+ new_count, new_html = update_display("latest")
150
+ greatest_count, greatest_html = update_display("greatest")
151
+ return top_count, top_html, new_count, new_html, greatest_count, greatest_html
152
+
153
+ # 🚀 Asynchronous Paper Processing Wizardry 🚀
154
 
155
  async def download_and_process_pdf(session, title, paper_info, directory):
156
+ """Download and process a PDF. It's like turning lead into gold, but with papers! ๐Ÿ“œโœจ"""
157
  pdf_url = paper_info['pdf_link']
158
  if not pdf_url:
159
+ return f"๐Ÿšซ No PDF link for: {title}. It's playing hide and seek! ๐Ÿ™ˆ", None, None
160
 
161
  try:
162
  timeout = aiohttp.ClientTimeout(total=60) # 60 seconds timeout
163
  async with session.get(pdf_url, timeout=timeout) as response:
164
  if response.status != 200:
165
+ return f"๐Ÿšจ Failed to grab PDF for {title}: HTTP {response.status}. The internet gremlins strike again! ๐Ÿ‘น", None, None
166
  pdf_content = await response.read()
167
 
168
  file_length = len(pdf_content)
169
  if file_length < 5000: # Check if the PDF is less than 5KB
170
+ return f"๐Ÿœ PDF for {title} is tiny ({file_length} bytes). It's like a paper for ants! ๐Ÿœ๐Ÿ“„", None, None
171
 
172
  # Convert PDF to text
173
  pdf_file = io.BytesIO(pdf_content)
 
177
  text += page.extract_text()
178
 
179
  if len(text) < 5000: # Check if the extracted text is less than 5KB
180
+ return f"๐Ÿ“‰ Extracted text for {title} is too small ({len(text)} characters). It's not you, it's the PDF! ๐Ÿ’”", None, None
181
 
182
  safe_title = safe_filename(title)
183
  txt_filename = f"{safe_title}.txt"
 
186
  async with aiofiles.open(txt_filepath, 'w', encoding='utf-8') as f:
187
  await f.write(text)
188
 
189
+ return f"๐ŸŽ‰ Successfully processed: {txt_filename} (File length: {file_length} bytes). It's alive! ๐Ÿงฌ", txt_filepath, text
190
  except asyncio.TimeoutError:
191
+ return f"โณ Timeout for {title}. The PDF is playing hard to get! ๐Ÿ’ƒ", None, None
192
  except Exception as e:
193
+ return f"๐Ÿ’ฅ Oops! Error processing {title}: {str(e)}. Gremlins in the system! ๐Ÿ› ๏ธ", None, None
194
 
195
  async def process_papers(data, directory, progress=gr.Progress()):
196
+ """Process multiple papers asynchronously. It's like juggling papers, but faster! ๐Ÿคนโ€โ™‚๏ธ๐Ÿ“š"""
197
  async with aiohttp.ClientSession() as session:
198
  tasks = []
199
  for title, paper_info in data.items():
 
210
  successful_downloads.append((filepath, text))
211
  else:
212
  errors.append(result)
213
+ progress(i / len(tasks), f"🚀 Processed {i}/{len(tasks)} papers. Science waits for no one!")
214
+
215
+ # Display animated banner
216
+ banner_frames = animated_banner("Processing", "📄")
217
+ for frame in banner_frames:
218
+ print(frame, end='\r')
219
+ await asyncio.sleep(0.1)
220
+ print(" " * len(banner_frames[-1]), end='\r') # Clear the last frame
221
 
222
  return results, successful_downloads, errors
223

224
  def download_all_papers(progress=gr.Progress()):
225
+ """Download and process all papers. It's like hosting a paper party, and everyone's invited! ๐ŸŽ‰๐Ÿ“š"""
226
  all_data = {}
227
  for category in ["top", "latest", "greatest"]:
228
  cache_file = f"{category}_papers_cache.json"
 
233
  date_directory = create_date_directory()
234
  results, successful_downloads, errors = asyncio.run(process_papers(all_data, date_directory, progress))
235
 
236
+ summary = f"๐Ÿ“Š Papers processed: {len(all_data)} (We're basically librarians now!)\n"
237
+ summary += f"โœ… Successfully downloaded and converted: {len(successful_downloads)} (Take that, PDF gremlins!)\n"
238
+ summary += f"โŒ Errors: {len(errors)} (Even superheroes have off days)\n\n"
239
+ summary += "๐Ÿšจ Error List (AKA 'The Wall of Shame'):\n" + "\n".join(errors) if errors else "No errors occurred. It's a miracle! ๐Ÿ™Œ"
240
 
241
  download_links = []
242
  text_contents = []
243
  for filepath, text in successful_downloads:
244
  filename = os.path.basename(filepath)
245
  download_links.append(get_base64_download_link(text, filename))
246
+ text_contents.append(f"--- {filename} ---\n\n{text[:1000]}... (There's more, but we don't want to spoil the ending! 📚🔮)\n\n")
247
 
248
  return summary, "<br>".join(download_links), "\n".join(text_contents)
249
 
250
+ # 🎭 Gradio Interface: Where the Magic Happens 🎭
251
+
252
  with gr.Blocks() as demo:
253
  gr.Markdown("<h1><center>Papers Leaderboard</center></h1>")
254
 
 
277
  download_button.click(fn=download_all_papers, inputs=None, outputs=[download_output, download_links, text_output])
278
 
279
  # Load initial data for all tabs
280
+ demo.load(fn=load_all_data, outputs=[
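
The caching helpers added in this commit (load_cached_data, save_cached_data, load_and_cache_data) follow a plain cache-or-fetch pattern around a JSON file. Below is a minimal, self-contained sketch of that pattern; the fetch_fresh stand-in and the example_cache.json filename are illustrative only and not part of the commit.

import json
import os

def load_cached(cache_file):
    # Return previously cached data, or None when the cache file is absent.
    if os.path.exists(cache_file):
        with open(cache_file, "r") as f:
            return json.load(f)
    return None

def fetch_fresh():
    # Stand-in for get_rank_papers(url); returns the same dict-of-dicts shape.
    return {"Example Paper": {"link": "https://example.org", "Github Star": 0, "pdf_link": ""}}

def load_or_fetch(cache_file="example_cache.json"):
    # Serve from cache when possible, otherwise fetch once and persist for next time.
    data = load_cached(cache_file)
    if data is not None:
        return data
    data = fetch_fresh()
    with open(cache_file, "w") as f:
        json.dump(data, f)
    return data

if __name__ == "__main__":
    print(load_or_fetch())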
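The new process_papers coroutine fans downloads out with aiohttp and asyncio.gather, collecting one status message per paper. A stripped-down sketch of that fan-out pattern follows; the 60-second timeout mirrors the diff, while fetch_one, fetch_all, and the placeholder URL list are hypothetical names used only for this example.

import asyncio
import aiohttp

async def fetch_one(session, url):
    # Download one URL, returning (status message, payload or None).
    try:
        timeout = aiohttp.ClientTimeout(total=60)
        async with session.get(url, timeout=timeout) as response:
            if response.status != 200:
                return f"HTTP {response.status} for {url}", None
            return f"ok: {url}", await response.read()
    except asyncio.TimeoutError:
        return f"timeout for {url}", None

async def fetch_all(urls):
    # One shared session; gather runs the downloads concurrently.
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_one(session, url) for url in urls]
        return await asyncio.gather(*tasks)

if __name__ == "__main__":
    results = asyncio.run(fetch_all(["https://example.org/"]))
    for message, payload in results:
        print(message, 0 if payload is None else len(payload))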