awacke1 committed on
Commit
36e61fc
·
verified ·
1 Parent(s): 60d9046

Update app.py

Files changed (1)
  1. app.py +73 -125
app.py CHANGED
@@ -5,10 +5,13 @@ import gradio as gr
  import time
  import os
  import json
- import aiohttp
- import aiofiles
  import re
  from datetime import datetime
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+ import networkx as nx
+ from pyvis.network import Network
+ import matplotlib.pyplot as plt

  # 🧙‍♂️ Magical Utility Functions 🧙‍♂️

@@ -16,20 +19,6 @@ def safe_filename(title):
      """Convert a string to a safe filename. No more 'file not found' nightmares! 🙅‍♂️📁"""
      return re.sub(r'[^\w\-_\. ]', '_', title)

- # 🎬 Animated Banner Messages 🎬
- def animated_banner(message, emoji):
-     """Create an animated banner message. It's like a tiny parade for your console! 🎉🚩"""
-     frames = [
-         f"╔════ {emoji} ════╗\n║ {message:^16} ║\n╚════════════╝",
-         f"╔════ {emoji} ════╗\n║ {message:^16} ║\n╚════════════╝",
-         f"╔════{emoji}════╗\n║ {message:^14} ║\n╚══════════╝",
-         f"╔═══{emoji}═══╗\n║ {message:^12} ║\n╚════════╝",
-         f"╔══{emoji}══╗\n║ {message:^10} ║\n╚══════╝",
-         f"╔═{emoji}═╗\n║ {message:^8} ║\n╚════╝",
-         f"╔{emoji}╗\n║ {message:^6} ║\n╚══╝",
-     ]
-     return frames
-
  # 🕵️‍♂️ Data Fetching and Caching Shenanigans 🕵️‍♂️

  def get_rank_papers(url, progress=gr.Progress(track_tqdm=True)):
@@ -65,19 +54,8 @@ def get_rank_papers(url, progress=gr.Progress(track_tqdm=True)):

              Github_Star = ppr.find('span', class_='badge badge-secondary').text.strip().replace(',', '') if ppr.find('span', class_='badge badge-secondary') else "0"

-             pdf_link = ''
-             try:
-                 response_link = session.get(link, headers=headers)
-                 soup_link = BeautifulSoup(response_link.text, 'html.parser')
-                 paper_info_link = soup_link.find_all('div', class_='paper-abstract')
-                 pdf_link = paper_info_link[0].find('div', class_='col-md-12').find('a')['href']
-             except Exception as e:
-                 print(f"Failed to retrieve PDF link for {title}: {e}")
-
-             print(f"Title: {title}, Link: {link}, Github Star: {Github_Star}, PDF Link: {pdf_link}")
-
              if title not in data_list:
-                 data_list[title] = {'link': link, 'Github Star': int(Github_Star), 'pdf_link': pdf_link.strip()}
+                 data_list[title] = {'link': link, 'Github Star': int(Github_Star), 'title': title}
              else:
                  break_duplicate -= 1
                  if break_duplicate == 0:
@@ -112,91 +90,53 @@ def load_and_cache_data(url, cache_file):
      save_cached_data(new_data, cache_file)
      return new_data

- # 📊 Data Processing and Display Magic 📊
-
- def format_dataframe(data):
-     """Format data into a pretty DataFrame. It's like giving your data a makeover! 💅📈"""
-     if not data:
-         print("No data found to format.")
-         return pd.DataFrame()
-
-     df = pd.DataFrame(data).T
-     df['title'] = df.index
-
-     # Check if required columns are present
-     if 'Github Star' in df.columns and 'link' in df.columns and 'pdf_link' in df.columns:
-         df = df[['title', 'Github Star', 'link', 'pdf_link']]
-         df = df.sort_values(by='Github Star', ascending=False)
-         df['link'] = df['link'].apply(lambda x: f'<a href="{x}" target="_blank">Link</a>')
-         df['pdf_link'] = df['pdf_link'].apply(lambda x: f'<a href="{x}" target="_blank">{x}</a>')
-     else:
-         print("Required columns are missing in the dataframe.")
-         print(f"Columns available: {df.columns}")
-
-     return df
-
- def update_display(category):
-     """Update the display for a category. Freshen up your data like it's spring cleaning! 🧹🌸"""
-     cache_file = f"{category}_papers_cache.json"
-     url = f"https://paperswithcode.com/{category}" if category != "top" else "https://paperswithcode.com/"
-
-     data = load_and_cache_data(url, cache_file)
-     df = format_dataframe(data)
-
-     return len(df), df.to_html(escape=False, index=False)
-
- def load_all_data():
-     """Load data for all categories. It's like a buffet for your brain! 🧠🍽️"""
-     top_count, top_html = update_display("top")
-     new_count, new_html = update_display("latest")
-     greatest_count, greatest_html = update_display("greatest")
-     return top_count, top_html, new_count, new_html, greatest_count, greatest_html
-
- # 🚀 Asynchronous Web Page Downloading 🚀
-
- async def download_webpage(session, title, paper_info):
-     """Download the webpage content instead of the PDF. It's like browsing, but faster! 🌐📄"""
-     link_url = paper_info['link']
-     if not link_url:
-         return f"🚫 No link for: {title}. It's playing hide and seek! 🙈", None, None
-
-     try:
-         timeout = aiohttp.ClientTimeout(total=60)  # 60 seconds timeout
-         async with session.get(link_url, timeout=timeout) as response:
-             if response.status != 200:
-                 return f"🚨 Failed to grab webpage for {title}: HTTP {response.status}. The internet gremlins strike again! 👹", None, None
-             page_content = await response.text()
-
-         # Combine the content as a Python type representation
-         code_block = f'"""\nTitle: {title}\nLink: {link_url}\n"""\n\n# Webpage Content\n{repr(page_content)}\n'
-
-         return f"🎉 Successfully downloaded webpage for: {title}.", code_block, page_content
-     except asyncio.TimeoutError:
-         return f"⏳ Timeout for {title}. The webpage is playing hard to get! 💃", None, None
-     except Exception as e:
-         return f"💥 Oops! Error downloading {title}: {str(e)}. Gremlins in the system! 🛠️", None, None
-
- async def process_webpages(data, progress=gr.Progress()):
-     """Process multiple papers asynchronously by downloading their webpages. 🤹‍♂️🌐"""
-     async with aiohttp.ClientSession() as session:
-         tasks = []
-         for title, paper_info in data.items():
-             task = asyncio.ensure_future(download_webpage(session, title, paper_info))
-             tasks.append(task)
-
-         results = []
-         codes = []
-         for i, task in enumerate(asyncio.as_completed(tasks), start=1):
-             result, code_block, page_content = await task
-             results.append(result)
-             if code_block:
-                 codes.append(code_block)
-             progress(i / len(tasks), f"🚀 Processed {i}/{len(tasks)} papers. Downloading...")
-
-         return results, codes
-
- def download_all_webpages(progress=gr.Progress()):
-     """Download and display all paper webpages. It's like hosting a web party, and everyone's invited! 🎉🌐"""
+ # 🚀 Transformer-based Word and Context Analysis 🚀
+
+ def generate_embeddings(titles):
+     """Generate word embeddings using a transformer model."""
+     model_name = "sentence-transformers/all-MiniLM-L6-v2"
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModel.from_pretrained(model_name)
+
+     embeddings = []
+     with torch.no_grad():
+         for title in titles:
+             tokens = tokenizer(title, return_tensors="pt", padding=True, truncation=True)
+             output = model(**tokens)
+             embeddings.append(output.last_hidden_state.mean(dim=1).squeeze())
+
+     return embeddings
+
+ def build_graph(titles, embeddings, threshold=0.7):
+     """Build a graph of words based on similarity between titles."""
+     G = nx.Graph()
+
+     for i, title in enumerate(titles):
+         G.add_node(i, label=title)
+
+     for i in range(len(embeddings)):
+         for j in range(i+1, len(embeddings)):
+             sim = torch.cosine_similarity(embeddings[i], embeddings[j], dim=0).item()
+             if sim > threshold:
+                 G.add_edge(i, j, weight=sim)
+
+     return G
+
+ def visualize_graph(G, titles):
+     """Visualize the graph using pyvis and show it as a mind map."""
+     net = Network(height="750px", width="100%", notebook=True)
+
+     for node in G.nodes(data=True):
+         net.add_node(node[0], label=titles[node[0]])
+
+     for edge in G.edges(data=True):
+         net.add_edge(edge[0], edge[1], value=edge[2]['weight'])
+
+     net.show("paper_network.html")
+     return "paper_network.html"
+
+ def analyze_and_generate_graph(progress=gr.Progress()):
+     """Analyze papers, generate embeddings, and visualize the relationship graph."""
      all_data = {}
      for category in ["top", "latest", "greatest"]:
          cache_file = f"{category}_papers_cache.json"
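A minimal usage sketch for the helpers added above (the sample titles are made up; generate_embeddings and build_graph are the functions from this hunk). generate_embeddings mean-pools each title's token states into one vector, and build_graph adds an edge wherever the cosine similarity of two title vectors clears the threshold:

```python
# Hedged sketch: exercise the new embedding -> graph pipeline on toy titles.
titles = [
    "Attention Is All You Need",
    "BERT: Pre-training of Deep Bidirectional Transformers",
    "Playing Atari with Deep Reinforcement Learning",
]
embeddings = generate_embeddings(titles)            # one mean-pooled vector per title
G = build_graph(titles, embeddings, threshold=0.5)  # edge when cosine similarity > 0.5

for i, j, attrs in G.edges(data=True):
    print(f"{titles[i]!r} <-> {titles[j]!r}: similarity {attrs['weight']:.2f}")
```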
@@ -204,19 +144,26 @@ def download_all_webpages(progress=gr.Progress()):
          if data:
              all_data.update(data)

-     # Download the webpage content
-     results, code_blocks = asyncio.run(process_webpages(all_data, progress))
+     titles = [paper['title'] for paper in all_data.values()]

-     summary = f"📊 Papers processed: {len(all_data)} (We're basically librarians now!)\n"
-     summary += f"✅ Successfully downloaded: {len(code_blocks)} webpages\n"
-     summary += f"❌ Errors: {len(results) - len(code_blocks)} (Even superheroes have off days)\n\n"
+     # Generate embeddings
+     embeddings = generate_embeddings(titles)

-     return summary, "\n\n".join(code_blocks)
+     # Build a similarity graph based on the embeddings
+     G = build_graph(titles, embeddings)
+
+     # Visualize the graph as a mind map
+     graph_file = visualize_graph(G, titles)
+
+     summary = f"📊 Papers analyzed: {len(titles)}\n"
+     summary += f"✅ Graph generated and visualized.\n"
+
+     return summary, graph_file

  # 🎭 Gradio Interface: Where the Magic Happens 🎭

  with gr.Blocks() as demo:
-     gr.Markdown("<h1><center>Papers Leaderboard</center></h1>")
+     gr.Markdown("<h1><center>Papers Leaderboard with Context Analysis</center></h1>")

      with gr.Tab("Top Trending Papers"):
          top_count = gr.Textbox(label="Number of Papers Fetched")
@@ -236,14 +183,15 @@ with gr.Blocks() as demo:
          greatest_button = gr.Button("Refresh Leaderboard")
          greatest_button.click(fn=lambda: update_display("greatest"), inputs=None, outputs=[greatest_count, greatest_html])

-     download_button = gr.Button("📚 Download All Paper Webpages", variant="primary")
-     download_output = gr.Textbox(label="Download Status")
-     code_output = gr.Code(label="Paper Webpage Contents", language="python")
-     download_button.click(fn=download_all_webpages, inputs=None, outputs=[download_output, code_output])
+     analyze_button = gr.Button("🔍 Analyze and Generate Graph", variant="primary")
+     analyze_output = gr.Textbox(label="Analysis Status")
+     graph_output = gr.HTML(label="Graph Visualization")
+
+     analyze_button.click(fn=analyze_and_generate_graph, inputs=None, outputs=[analyze_output, graph_output])

      # Load initial data for all tabs
      demo.load(fn=load_all_data, outputs=[top_count, top_html, new_count, new_html, greatest_count, greatest_html])

  # 🚀 Launch the Gradio interface with a public link
- print("🎭 Launching the Papers Leaderboard! Get ready for a wild ride through the land of academia! 🎒📚")
+ print("🎭 Launching the Papers Leaderboard with Context Analysis! Get ready to explore the relationships between papers! 🎒📚")
  demo.launch(share=True)
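One caveat with the wiring above: analyze_and_generate_graph returns a file path, but gr.HTML renders a raw HTML string, so the component would display the literal text "paper_network.html". A possible workaround, not part of this commit (html_file_to_iframe is a hypothetical helper), is to read the pyvis output and embed it in a data-URI iframe:

```python
import base64

def html_file_to_iframe(path):
    """Hypothetical helper (not in this commit): wrap a pyvis HTML file
    in a data-URI iframe so gr.HTML can actually render it."""
    with open(path, "r", encoding="utf-8") as f:
        html = f.read()
    encoded = base64.b64encode(html.encode("utf-8")).decode("utf-8")
    return (f'<iframe src="data:text/html;base64,{encoded}" '
            f'width="100%" height="750px" frameborder="0"></iframe>')

# e.g. have analyze_and_generate_graph return summary, html_file_to_iframe(graph_file)
```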
 