import requests
from bs4 import BeautifulSoup
import pandas as pd
import gradio as gr
import time


def get_rank_papers(url, progress=gr.Progress(track_tqdm=True)):
    base_url = "https://paperswithcode.com"
    session = requests.Session()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Cache-Control': 'no-cache'
    }
    print("Time run at : ", time.ctime())

    offset = 0
    data_list = {}
    break_duplicate = 10

    while True:
        # Fetch one page of paper cards; stop on HTTP errors or when no cards remain.
        response = session.get(url, headers=headers, params={'page': offset})
        if response.status_code != 200:
            print('Failed to retrieve data')
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        paper_info = soup.find_all('div', class_='row infinite-item item paper-card')
        if not paper_info:
            break

        for ppr in paper_info:
            title = ppr.find('h1').text.strip()

            if "paper" in ppr.find('a')['href']:
                link = base_url + ppr.find('a')['href']
            else:
                link = ppr.find('a')['href']

            Github_Star = ppr.find('span', class_='badge badge-secondary').text.strip().replace(',', '')

            # Follow the paper page to pull the PDF link out of the abstract block.
            pdf_link = ''
            try:
                response_link = session.get(link, headers=headers)
                soup_link = BeautifulSoup(response_link.text, 'html.parser')
                paper_info_link = soup_link.find_all('div', class_='paper-abstract')
                pdf_link = paper_info_link[0].find('div', class_='col-md-12').find('a')['href']
            except Exception:
                pass

            if title not in data_list:
                data_list[title] = {'link': link, 'Github Star': int(Github_Star), 'pdf_link': pdf_link.strip()}
            else:
                # Stop once too many already-seen titles show up.
                break_duplicate -= 1
                if break_duplicate == 0:
                    return data_list

        offset += 1
        progress.update(offset)

    print('Data retrieval complete')
    return data_list


def reload_gradio_top():
    # Retrieve ranked papers (trending page)
    print('Retrieving data .. top papers')
    rank_paper_df = get_rank_papers(url="https://paperswithcode.com/")
    rank_paper_df = dict(sorted(rank_paper_df.items(), key=lambda x: x[1]['Github Star'], reverse=True))
    rank_paper_df = pd.DataFrame(rank_paper_df).T

    # Add title column
    rank_paper_df['title'] = rank_paper_df.index
    rank_paper_df = rank_paper_df[['title', 'Github Star', 'link', 'pdf_link']]

    # Convert link columns to HTML links
    rank_paper_df['link'] = rank_paper_df['link'].apply(lambda x: f'<a href="{x}">Link</a>')
    rank_paper_df['pdf_link'] = rank_paper_df['pdf_link'].apply(lambda x: f'<a href="{x}">{x}</a>')

    rank_paper = rank_paper_df.to_html(escape=False, index=False)
    return len(rank_paper_df), rank_paper


def reload_gradio_new():
    # Retrieve ranked papers (latest page)
    print('Retrieving data .. new papers')
    rank_paper_df = get_rank_papers(url="https://paperswithcode.com/latest")
    rank_paper_df = dict(sorted(rank_paper_df.items(), key=lambda x: x[1]['Github Star'], reverse=True))
    rank_paper_df = pd.DataFrame(rank_paper_df).T

    # Add title column
    rank_paper_df['title'] = rank_paper_df.index
    rank_paper_df = rank_paper_df[['title', 'Github Star', 'link', 'pdf_link']]

    # Convert link columns to HTML links
    rank_paper_df['link'] = rank_paper_df['link'].apply(lambda x: f'<a href="{x}">Link</a>')
    rank_paper_df['pdf_link'] = rank_paper_df['pdf_link'].apply(lambda x: f'<a href="{x}">{x}</a>')

    rank_paper = rank_paper_df.to_html(escape=False, index=False)
    return len(rank_paper_df), rank_paper


def reload_gradio_greatest():
    # Retrieve ranked papers (greatest page)
    print('Retrieving data .. greatest papers')
    rank_paper_df = get_rank_papers(url="https://paperswithcode.com/greatest")
    rank_paper_df = dict(sorted(rank_paper_df.items(), key=lambda x: x[1]['Github Star'], reverse=True))
    rank_paper_df = pd.DataFrame(rank_paper_df).T

    # Add title column
    rank_paper_df['title'] = rank_paper_df.index
    rank_paper_df = rank_paper_df[['title', 'Github Star', 'link', 'pdf_link']]

    # Convert link columns to HTML links
    rank_paper_df['link'] = rank_paper_df['link'].apply(lambda x: f'<a href="{x}">Link</a>')
    rank_paper_df['pdf_link'] = rank_paper_df['pdf_link'].apply(lambda x: f'<a href="{x}">{x}</a>')

    rank_paper = rank_paper_df.to_html(escape=False, index=False)
    return len(rank_paper_df), rank_paper


with gr.Blocks() as demo:
    gr.Markdown("