awacke1 committed on
Commit a7988d4 · verified · 1 Parent(s): f00ae4c

Update app.py

Files changed (1)
  1. app.py +93 -2
app.py CHANGED
@@ -13,7 +13,98 @@ import aiohttp
  import aiofiles
  from concurrent.futures import ThreadPoolExecutor

- # ... (keep the existing functions like get_rank_papers, load_cached_data, save_cached_data, format_dataframe, load_and_cache_data, update_display, load_all_data)

  async def download_and_convert_pdf(session, title, paper_info):
      pdf_url = paper_info['pdf_link']
@@ -74,7 +165,7 @@ def download_all_papers(progress=gr.Progress()):
      with open("consolidated_papers.md", "w", encoding="utf-8") as f:
          f.write(consolidated_text)

-     return "All papers have been downloaded and consolidated into 'consolidated_papers.md'"

  with gr.Blocks() as demo:
      gr.Markdown("<h1><center>Papers Leaderboard</center></h1>")
 
  import aiofiles
  from concurrent.futures import ThreadPoolExecutor

+ def get_rank_papers(url, progress=gr.Progress(track_tqdm=True)):
+     base_url = "https://paperswithcode.com"
+     session = requests.Session()
+     headers = {
+         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
+         'Cache-Control': 'no-cache'
+     }
+     print("Time run at : ", time.ctime())
+     offset = 0
+     data_list = {}
+     break_duplicate = 10
+
+     while True:
+         response = session.get(url, headers=headers, params={'page': offset})
+         if response.status_code != 200:
+             print('Failed to retrieve data')
+             break
+         soup = BeautifulSoup(response.text, 'html.parser')
+         paper_info = soup.find_all('div', class_='row infinite-item item paper-card')
+         if not paper_info:
+             break
+         for ppr in paper_info:
+             title = ppr.find('h1').text.strip()
+
+             if "paper" in ppr.find('a')['href']:
+                 link = base_url + ppr.find('a')['href']
+             else:
+                 link = ppr.find('a')['href']
+             Github_Star = ppr.find('span', class_='badge badge-secondary').text.strip().replace(',', '')
+             pdf_link = ''
+             try:
+                 response_link = session.get(link, headers=headers)
+                 soup_link = BeautifulSoup(response_link.text, 'html.parser')
+                 paper_info_link = soup_link.find_all('div', class_='paper-abstract')
+                 pdf_link = paper_info_link[0].find('div', class_='col-md-12').find('a')['href']
+             except:
+                 pass
+             if title not in data_list:
+                 data_list[title] = {'link': link, 'Github Star': int(Github_Star), 'pdf_link': pdf_link.strip()}
+             else:
+                 break_duplicate -= 1
+                 if break_duplicate == 0:
+                     return data_list
+         offset += 1
+         progress.update(offset)
+     print('Data retrieval complete')
+     return data_list
+
+ def load_cached_data(cache_file):
+     if os.path.exists(cache_file):
+         with open(cache_file, 'r') as f:
+             return json.load(f)
+     return None
+
+ def save_cached_data(data, cache_file):
+     with open(cache_file, 'w') as f:
+         json.dump(data, f)
+
+ def format_dataframe(data):
+     df = pd.DataFrame(data).T
+     df['title'] = df.index
+     df = df[['title', 'Github Star', 'link', 'pdf_link']]
+     df['link'] = df['link'].apply(lambda x: f'<a href="{x}" target="_blank">Link</a>')
+     df['pdf_link'] = df['pdf_link'].apply(lambda x: f'<a href="{x}" target="_blank">{x}</a>')
+     return df
+
+ def load_and_cache_data(url, cache_file):
+     cached_data = load_cached_data(cache_file)
+
+     if cached_data:
+         print(f"Loading cached data from {cache_file}")
+         return cached_data
+
+     print(f"Fetching new data from {url}")
+     new_data = get_rank_papers(url)
+     save_cached_data(new_data, cache_file)
+     return new_data
+
+ def update_display(category):
+     cache_file = f"{category}_papers_cache.json"
+     url = f"https://paperswithcode.com/{category}" if category != "top" else "https://paperswithcode.com/"
+
+     data = load_and_cache_data(url, cache_file)
+     df = format_dataframe(data)
+
+     return len(df), df.to_html(escape=False, index=False)
+
+ def load_all_data():
+     top_count, top_html = update_display("top")
+     new_count, new_html = update_display("latest")
+     greatest_count, greatest_html = update_display("greatest")
+     return top_count, top_html, new_count, new_html, greatest_count, greatest_html

  async def download_and_convert_pdf(session, title, paper_info):
      pdf_url = paper_info['pdf_link']
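
The helpers added above are plain synchronous functions, so they can be exercised outside the Gradio app. A minimal usage sketch, assuming those functions and app.py's existing imports (requests, BeautifulSoup, pandas, json, os, time, gradio) are in scope; it simply retraces what update_display("top") does:

# Hypothetical usage sketch: retrace update_display("top") step by step.
data = load_and_cache_data("https://paperswithcode.com/", "top_papers_cache.json")  # scrape, or reuse the JSON cache
df = format_dataframe(data)                     # columns: title, Github Star, link, pdf_link
print(len(df), "papers loaded")                 # the count shown in the leaderboard
html = df.to_html(escape=False, index=False)    # the HTML string handed to the Gradio display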
 
      with open("consolidated_papers.md", "w", encoding="utf-8") as f:
          f.write(consolidated_text)

+     return "All papers have been downloaded and consolidated into 'consolidated_papers.md'", consolidated_text

  with gr.Blocks() as demo:
      gr.Markdown("<h1><center>Papers Leaderboard</center></h1>")
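
Because download_all_papers now returns two values (the status message and the consolidated Markdown text), its click handler needs two output components. The rest of app.py is not shown in this diff, so the wiring below is only a sketch with placeholder component names:

# Hypothetical wiring sketch; component names are placeholders, not necessarily those used in app.py.
with gr.Blocks() as demo:
    gr.Markdown("<h1><center>Papers Leaderboard</center></h1>")
    download_button = gr.Button("Download All Papers")
    download_status = gr.Textbox(label="Status")
    consolidated_view = gr.Markdown()
    # Two outputs to match the new (message, consolidated_text) return value.
    download_button.click(download_all_papers, inputs=[], outputs=[download_status, consolidated_view])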