Update app.py
app.py CHANGED
@@ -5,10 +5,13 @@ import gradio as gr
 import time
 import os
 import json
-import aiohttp
-import aiofiles
 import re
 from datetime import datetime
+import torch
+from transformers import AutoTokenizer, AutoModel
+import networkx as nx
+from pyvis.network import Network
+import matplotlib.pyplot as plt
 
 # 🧙‍♂️ Magical Utility Functions 🧙‍♂️
 
@@ -16,20 +19,6 @@ def safe_filename(title):
     """Convert a string to a safe filename. No more 'file not found' nightmares! 🙅‍♂️📁"""
     return re.sub(r'[^\w\-_\. ]', '_', title)
 
-# 🎬 Animated Banner Messages 🎬
-def animated_banner(message, emoji):
-    """Create an animated banner message. It's like a tiny parade for your console! 🎉🎩"""
-    frames = [
-        f"═════ {emoji} ═════\n║ {message:^16} ║\n══════════════",
-        f"═════ {emoji} ═════\n║ {message:^16} ║\n══════════════",
-        f"═════{emoji}═════\n║ {message:^14} ║\n════════════",
-        f"════{emoji}════\n║ {message:^12} ║\n══════════",
-        f"═══{emoji}═══\n║ {message:^10} ║\n════════",
-        f"══{emoji}══\n║ {message:^8} ║\n══════",
-        f"═{emoji}═\n║ {message:^6} ║\n════",
-    ]
-    return frames
-
 # 🕵️‍♀️ Data Fetching and Caching Shenanigans 🕵️‍♀️
 
 def get_rank_papers(url, progress=gr.Progress(track_tqdm=True)):
@@ -65,19 +54,8 @@ def get_rank_papers(url, progress=gr.Progress(track_tqdm=True)):
 
             Github_Star = ppr.find('span', class_='badge badge-secondary').text.strip().replace(',', '') if ppr.find('span', class_='badge badge-secondary') else "0"
 
-            pdf_link = ''
-            try:
-                response_link = session.get(link, headers=headers)
-                soup_link = BeautifulSoup(response_link.text, 'html.parser')
-                paper_info_link = soup_link.find_all('div', class_='paper-abstract')
-                pdf_link = paper_info_link[0].find('div', class_='col-md-12').find('a')['href']
-            except Exception as e:
-                print(f"Failed to retrieve PDF link for {title}: {e}")
-
-            print(f"Title: {title}, Link: {link}, Github Star: {Github_Star}, PDF Link: {pdf_link}")
-
             if title not in data_list:
-                data_list[title] = {'link': link, 'Github Star': int(Github_Star), '
+                data_list[title] = {'link': link, 'Github Star': int(Github_Star), 'title': title}
             else:
                 break_duplicate -= 1
                 if break_duplicate == 0:
@@ -112,91 +90,53 @@ def load_and_cache_data(url, cache_file):
     save_cached_data(new_data, cache_file)
     return new_data
 
-#
-
-async def download_webpage(session, title, paper_info):
-    """
-    ...
-    """
-
-    try:
-        timeout = aiohttp.ClientTimeout(total=60)  # 60 seconds timeout
-        async with session.get(link_url, timeout=timeout) as response:
-            if response.status != 200:
-                return f"🚨 Failed to grab webpage for {title}: HTTP {response.status}. The internet gremlins strike again! 👹", None, None
-            page_content = await response.text()
-
-        # Combine the content as a Python type representation
-        code_block = f'"""\nTitle: {title}\nLink: {link_url}\n"""\n\n# Webpage Content\n{repr(page_content)}\n'
-
-        return f"📜 Successfully downloaded webpage for: {title}.", code_block, page_content
-    except asyncio.TimeoutError:
-        return f"⏳ Timeout for {title}. The webpage is playing hard to get! 🙈", None, None
-    except Exception as e:
-        return f"💥 Oops! Error downloading {title}: {str(e)}. Gremlins in the system! 🛠️", None, None
-
-async def process_webpages(data, progress=gr.Progress()):
-    """Process multiple papers asynchronously by downloading their webpages. 🤹‍♂️🌐"""
-    async with aiohttp.ClientSession() as session:
-        tasks = []
-        for title, paper_info in data.items():
-            task = asyncio.ensure_future(download_webpage(session, title, paper_info))
-            tasks.append(task)
-
-        results = []
-        codes = []
-        for i, task in enumerate(asyncio.as_completed(tasks), start=1):
-            result, code_block, page_content = await task
-            results.append(result)
-            if code_block:
-                codes.append(code_block)
-            progress(i / len(tasks), f"📚 Processed {i}/{len(tasks)} papers. Downloading...")
-
-        return results, codes
-
-def download_all_webpages(progress=gr.Progress()):
-    """Download and display all paper webpages. It's like hosting a web party, and everyone's invited! 🎉🌐"""
+# 📊 Transformer-based Word and Context Analysis 📊
+
+def generate_embeddings(titles):
+    """Generate word embeddings using a transformer model."""
+    model_name = "sentence-transformers/all-MiniLM-L6-v2"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModel.from_pretrained(model_name)
+
+    embeddings = []
+    with torch.no_grad():
+        for title in titles:
+            tokens = tokenizer(title, return_tensors="pt", padding=True, truncation=True)
+            output = model(**tokens)
+            embeddings.append(output.last_hidden_state.mean(dim=1).squeeze())
+
+    return embeddings
+
+def build_graph(titles, embeddings, threshold=0.7):
+    """Build a graph of words based on similarity between titles."""
+    G = nx.Graph()
+
+    for i, title in enumerate(titles):
+        G.add_node(i, label=title)
+
+    for i in range(len(embeddings)):
+        for j in range(i+1, len(embeddings)):
+            sim = torch.cosine_similarity(embeddings[i], embeddings[j], dim=0).item()
+            if sim > threshold:
+                G.add_edge(i, j, weight=sim)
+
+    return G
+
+def visualize_graph(G, titles):
+    """Visualize the graph using pyvis and show it as a mind map."""
+    net = Network(height="750px", width="100%", notebook=True)
+
+    for node in G.nodes(data=True):
+        net.add_node(node[0], label=titles[node[0]])
+
+    for edge in G.edges(data=True):
+        net.add_edge(edge[0], edge[1], value=edge[2]['weight'])
+
+    net.show("paper_network.html")
+    return "paper_network.html"
+
+def analyze_and_generate_graph(progress=gr.Progress()):
+    """Analyze papers, generate embeddings, and visualize the relationship graph."""
     all_data = {}
     for category in ["top", "latest", "greatest"]:
         cache_file = f"{category}_papers_cache.json"
@@ -204,19 +144,26 @@ def download_all_webpages(progress=gr.Progress()):
         if data:
             all_data.update(data)
 
-
-    results, code_blocks = asyncio.run(process_webpages(all_data, progress))
+    titles = [paper['title'] for paper in all_data.values()]
 
-
-
-    summary += f"❌ Errors: {len(results) - len(code_blocks)} (Even superheroes have off days)\n\n"
+    # Generate embeddings
+    embeddings = generate_embeddings(titles)
 
-
+    # Build a similarity graph based on the embeddings
+    G = build_graph(titles, embeddings)
+
+    # Visualize the graph as a mind map
+    graph_file = visualize_graph(G, titles)
+
+    summary = f"📊 Papers analyzed: {len(titles)}\n"
+    summary += f"✅ Graph generated and visualized.\n"
+
+    return summary, graph_file
 
 # 🌟 Gradio Interface: Where the Magic Happens 🌟
 
 with gr.Blocks() as demo:
-    gr.Markdown("<h1><center>Papers Leaderboard</center></h1>")
+    gr.Markdown("<h1><center>Papers Leaderboard with Context Analysis</center></h1>")
 
     with gr.Tab("Top Trending Papers"):
         top_count = gr.Textbox(label="Number of Papers Fetched")
@@ -236,14 +183,15 @@ with gr.Blocks() as demo:
         greatest_button = gr.Button("Refresh Leaderboard")
         greatest_button.click(fn=lambda: update_display("greatest"), inputs=None, outputs=[greatest_count, greatest_html])
 
-
-
-
-
+    analyze_button = gr.Button("🔍 Analyze and Generate Graph", variant="primary")
+    analyze_output = gr.Textbox(label="Analysis Status")
+    graph_output = gr.HTML(label="Graph Visualization")
+
+    analyze_button.click(fn=analyze_and_generate_graph, inputs=None, outputs=[analyze_output, graph_output])
 
     # Load initial data for all tabs
     demo.load(fn=load_all_data, outputs=[top_count, top_html, new_count, new_html, greatest_count, greatest_html])
 
     # 🚀 Launch the Gradio interface with a public link
-print("🚀 Launching the Papers Leaderboard! Get ready
+print("🚀 Launching the Papers Leaderboard with Context Analysis! Get ready to explore the relationships between papers! 💡🎉")
 demo.launch(share=True)
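For reference, a minimal usage sketch of the pipeline this commit introduces (not part of the diff itself): it assumes the new helpers generate_embeddings, build_graph, and visualize_graph are in scope as defined above (for example, when run from inside app.py), and the paper titles are made up for illustration.

# Hypothetical usage sketch; assumes the three helpers from the diff above are in scope.
titles = [
    "Sample paper on transformer attention",        # made-up titles for illustration
    "Another transformer-based language model paper",
    "A paper about graph visualization tools",
]

embeddings = generate_embeddings(titles)             # one mean-pooled vector per title
G = build_graph(titles, embeddings, threshold=0.7)   # edge wherever cosine similarity > 0.7
html_file = visualize_graph(G, titles)               # writes and returns "paper_network.html"

print(f"{G.number_of_nodes()} nodes, {G.number_of_edges()} edges -> {html_file}")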