awacke1 committed
Commit a2f0fdc · verified · 1 Parent(s): 581c1bb

Create app.py

Files changed (1)
  1. app.py +160 -0
app.py ADDED
@@ -0,0 +1,160 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import pandas as pd
+ import gradio as gr
+ import time
+ import os
+ import json
+
+ def get_rank_papers(url, progress=gr.Progress(track_tqdm=True)):
+     base_url = "https://paperswithcode.com"
+     session = requests.Session()
+     headers = {
+         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
+         'Cache-Control': 'no-cache'
+     }
+     print("Time run at : ", time.ctime())
+     offset = 0
+     data_list = {}
+     break_duplicate = 10
+
+     while True:
+         response = session.get(url, headers=headers, params={'page': offset})
+         if response.status_code != 200:
+             print('Failed to retrieve data')
+             break
+         soup = BeautifulSoup(response.text, 'html.parser')
+         paper_info = soup.find_all('div', class_='row infinite-item item paper-card')
+         if not paper_info:
+             break
+         for ppr in paper_info:
+             title = ppr.find('h1').text.strip()
+
+             if "paper" in ppr.find('a')['href']:
+                 link = base_url + ppr.find('a')['href']
+             else:
+                 link = ppr.find('a')['href']
+             Github_Star = ppr.find('span', class_='badge badge-secondary').text.strip().replace(',', '')
+             pdf_link = ''
+             try:
+                 response_link = session.get(link, headers=headers)
+                 soup_link = BeautifulSoup(response_link.text, 'html.parser')
+                 paper_info_link = soup_link.find_all('div', class_='paper-abstract')
+                 pdf_link = paper_info_link[0].find('div', class_='col-md-12').find('a')['href']
+             except:
+                 pass
+             if title not in data_list:
+                 data_list[title] = {'link': link, 'Github Star': int(Github_Star), 'pdf_link': pdf_link.strip()}
+             else:
+                 break_duplicate -= 1
+                 if break_duplicate == 0:
+                     return data_list
+         offset += 1
+         progress.update(offset)
+     print('Data retrieval complete')
+     return data_list
+
+ def load_cached_data(cache_file):
+     if os.path.exists(cache_file):
+         with open(cache_file, 'r') as f:
+             return json.load(f)
+     return None
+
+ def save_cached_data(data, cache_file):
+     with open(cache_file, 'w') as f:
+         json.dump(data, f)
+
+ def format_dataframe(data):
+     df = pd.DataFrame(data).T
+     df['title'] = df.index
+     df = df[['title', 'Github Star', 'link', 'pdf_link']]
+     return df
+
+ def load_and_cache_data(url, cache_file):
+     cached_data = load_cached_data(cache_file)
+
+     if cached_data:
+         print(f"Loading cached data from {cache_file}")
+         return cached_data
+
+     print(f"Fetching new data from {url}")
+     new_data = get_rank_papers(url)
+     save_cached_data(new_data, cache_file)
+     return new_data
+
+ def update_display(category):
+     cache_file = f"{category}_papers_cache.json"
+     url = f"https://paperswithcode.com/{category}" if category != "top" else "https://paperswithcode.com/"
+
+     data = load_and_cache_data(url, cache_file)
+     df = format_dataframe(data)
+
+     return len(df), df
+
+ def load_all_data():
+     top_count, top_df = update_display("top")
+     new_count, new_df = update_display("latest")
+     greatest_count, greatest_df = update_display("greatest")
+     return top_count, top_df, new_count, new_df, greatest_count, greatest_df
+
+ def save_dataframe_generic(df, filename):
+     try:
+         df.to_csv(filename, index=False)
+         return "Dataframe saved successfully."
+     except Exception as e:
+         return f"Error saving dataframe: {e}"
+
+ def load_dataframe_generic(filename):
+     try:
+         if os.path.exists(filename):
+             df = pd.read_csv(filename)
+             return df, "Dataframe loaded successfully."
+         else:
+             return pd.DataFrame(), "Dataframe file not found."
+     except Exception as e:
+         return pd.DataFrame(), f"Error loading dataframe: {e}"
+
+ with gr.Blocks() as demo:
+     gr.Markdown("<h1><center>Papers Leaderboard</center></h1>")
+
+     with gr.Tab("Top Trending Papers"):
+         top_count = gr.Textbox(label="Number of Papers Fetched")
+         top_df = gr.DataFrame(interactive=True)
+         top_button = gr.Button("Refresh Leaderboard")
+         top_load_button = gr.Button("Load Dataframe")
+         top_save_button = gr.Button("Save Dataframe")
+         top_save_status = gr.Textbox(label="Status")
+
+         top_button.click(fn=lambda: update_display("top"), inputs=None, outputs=[top_count, top_df])
+         top_save_button.click(fn=lambda df: save_dataframe_generic(df, 'top_dataframe.csv'), inputs=top_df, outputs=top_save_status)
+         top_load_button.click(fn=lambda: load_dataframe_generic('top_dataframe.csv'), inputs=None, outputs=[top_df, top_save_status])
+
+     with gr.Tab("New Papers"):
+         new_count = gr.Textbox(label="Number of Papers Fetched")
+         new_df = gr.DataFrame(interactive=True)
+         new_button = gr.Button("Refresh Leaderboard")
+         new_load_button = gr.Button("Load Dataframe")
+         new_save_button = gr.Button("Save Dataframe")
+         new_save_status = gr.Textbox(label="Status")
+
+         new_button.click(fn=lambda: update_display("latest"), inputs=None, outputs=[new_count, new_df])
+         new_save_button.click(fn=lambda df: save_dataframe_generic(df, 'new_dataframe.csv'), inputs=new_df, outputs=new_save_status)
+         new_load_button.click(fn=lambda: load_dataframe_generic('new_dataframe.csv'), inputs=None, outputs=[new_df, new_save_status])
+
+     with gr.Tab("Greatest Papers"):
+         greatest_count = gr.Textbox(label="Number of Papers Fetched")
+         greatest_df = gr.DataFrame(interactive=True)
+         greatest_button = gr.Button("Refresh Leaderboard")
+         greatest_load_button = gr.Button("Load Dataframe")
+         greatest_save_button = gr.Button("Save Dataframe")
+         greatest_save_status = gr.Textbox(label="Status")
+
+         greatest_button.click(fn=lambda: update_display("greatest"), inputs=None, outputs=[greatest_count, greatest_df])
+         greatest_save_button.click(fn=lambda df: save_dataframe_generic(df, 'greatest_dataframe.csv'), inputs=greatest_df, outputs=greatest_save_status)
+         greatest_load_button.click(fn=lambda: load_dataframe_generic('greatest_dataframe.csv'), inputs=None, outputs=[greatest_df, greatest_save_status])
+
+     # Load initial data for all tabs
+     demo.load(fn=load_all_data, outputs=[top_count, top_df, new_count, new_df, greatest_count, greatest_df])
+
+ # Launch the Gradio interface with a public link
+ demo.launch(share=True)
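
For a quick sanity check of the cache and table-formatting helpers, without scraping paperswithcode.com or launching the interface, something like the sketch below can be pasted into a Python session where save_cached_data, load_cached_data, and format_dataframe are already defined. The sample record is a made-up placeholder in the same shape get_rank_papers returns (title mapped to link, star count, and PDF link), not real scraped data.

# Assumes save_cached_data, load_cached_data, and format_dataframe from app.py
# are in scope. The record below is a placeholder, not a real scrape result.
sample = {
    "Example Paper": {
        "link": "https://paperswithcode.com/paper/example",
        "Github Star": 123,
        "pdf_link": "https://example.com/paper.pdf",
    }
}

save_cached_data(sample, "top_papers_cache.json")   # write the JSON cache file
cached = load_cached_data("top_papers_cache.json")  # read it back verbatim
df = format_dataframe(cached)                       # one row per paper title
print(len(df), "paper(s)")                          # the count shown in the Textbox
print(df.to_string(index=False))

The resulting four-column frame (title, Github Star, link, pdf_link) is the same shape the Refresh buttons push into each tab's gr.DataFrame.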