awacke1 committed
Commit fe2ce4d · verified · 1 Parent(s): fd22ee8

Update app.py

Files changed (1)
  1. app.py +79 -4
app.py CHANGED
@@ -6,7 +6,83 @@ import time
  import os
  import json
 
- # ... (keep all the previous functions unchanged)
+ def get_rank_papers(url, progress=gr.Progress(track_tqdm=True)):
+     base_url = "https://paperswithcode.com"
+     session = requests.Session()
+     headers = {
+         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
+         'Cache-Control': 'no-cache'
+     }
+     print("Time run at : ", time.ctime())
+     offset = 0
+     data_list = {}
+     break_duplicate = 10
+
+     while True:
+         response = session.get(url, headers=headers, params={'page': offset})
+         if response.status_code != 200:
+             print('Failed to retrieve data')
+             break
+         soup = BeautifulSoup(response.text, 'html.parser')
+         paper_info = soup.find_all('div', class_='row infinite-item item paper-card')
+         if not paper_info:
+             break
+         for ppr in paper_info:
+             title = ppr.find('h1').text.strip()
+
+             if "paper" in ppr.find('a')['href']:
+                 link = base_url + ppr.find('a')['href']
+             else:
+                 link = ppr.find('a')['href']
+             Github_Star = ppr.find('span', class_='badge badge-secondary').text.strip().replace(',', '')
+             pdf_link = ''
+             try:
+                 response_link = session.get(link, headers=headers)
+                 soup_link = BeautifulSoup(response_link.text, 'html.parser')
+                 paper_info_link = soup_link.find_all('div', class_='paper-abstract')
+                 pdf_link = paper_info_link[0].find('div', class_='col-md-12').find('a')['href']
+             except:
+                 pass
+             if title not in data_list:
+                 data_list[title] = {'link': link, 'Github Star': int(Github_Star), 'pdf_link': pdf_link.strip()}
+             else:
+                 break_duplicate -= 1
+                 if break_duplicate == 0:
+                     return data_list
+         offset += 1
+         progress.update(offset)
+     print('Data retrieval complete')
+     return data_list
+
+ def load_cached_data(cache_file):
+     if os.path.exists(cache_file):
+         with open(cache_file, 'r') as f:
+             return json.load(f)
+     return None
+
+ def save_cached_data(data, cache_file):
+     with open(cache_file, 'w') as f:
+         json.dump(data, f)
+
+ def format_dataframe(data):
+     df = pd.DataFrame(data).T
+     df['title'] = df.index
+     df = df[['title', 'Github Star', 'link', 'pdf_link']]
+     df['link'] = df['link'].apply(lambda x: f'<a href="{x}" target="_blank">Link</a>')
+     df['pdf_link'] = df['pdf_link'].apply(lambda x: f'<a href="{x}" target="_blank">{x}</a>')
+     return df
+
+ def load_and_cache_data(url, cache_file):
+     cached_data = load_cached_data(cache_file)
+
+     if cached_data:
+         print(f"Loading cached data from {cache_file}")
+         return cached_data
+
+     print(f"Fetching new data from {url}")
+     new_data = get_rank_papers(url)
+     save_cached_data(new_data, cache_file)
+     return new_data
 
  def update_display(category):
      cache_file = f"{category}_papers_cache.json"
@@ -47,6 +123,5 @@ with gr.Blocks() as demo:
      # Load initial data for all tabs
      demo.load(fn=load_all_data, outputs=[top_count, top_html, new_count, new_html, greatest_count, greatest_html])
 
- # Launch the Gradio interface
- demo.launch()
-
+ # Launch the Gradio interface with a public link
+ demo.launch(share=True)
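
The helpers added in this commit compose as follows: load_and_cache_data() first tries load_cached_data() on a per-category JSON file, falls back to scraping paperswithcode.com with get_rank_papers() on a cache miss, persists the result with save_cached_data(), and format_dataframe() turns the cached dict into an HTML-linked table for the Gradio tabs. Below is a minimal sketch of the cache/format round trip only (no scraping); the sample record and file name are illustrative, not taken from the commit, and it assumes app.py imports pandas as pd higher up, which this hunk does not show.

    # Illustrative driver code, not part of this commit.
    # The record mirrors the structure built by get_rank_papers().
    sample = {
        "Example Paper": {
            "link": "https://paperswithcode.com/paper/example",
            "Github Star": 123,
            "pdf_link": "https://arxiv.org/pdf/0000.00000",
        }
    }
    save_cached_data(sample, "top_papers_cache.json")       # write the JSON cache
    cached = load_cached_data("top_papers_cache.json")      # read it back
    print(format_dataframe(cached).to_html(escape=False))   # HTML table with anchor links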