Update app.py
app.py
CHANGED
@@ -13,185 +13,74 @@ import aiofiles
from concurrent.futures import ThreadPoolExecutor
import re
from datetime import datetime
-import zipfile
import base64

-
-base_url = "https://paperswithcode.com"
-session = requests.Session()
-headers = {
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
-    'Cache-Control': 'no-cache'
-}
-print("Time run at : ", time.ctime())
-offset = 0
-data_list = {}
-break_duplicate = 10
-
-while True:
-    response = session.get(url, headers=headers, params={'page': offset})
-    if response.status_code != 200:
-        print('Failed to retrieve data')
-        break
-    soup = BeautifulSoup(response.text, 'html.parser')
-    paper_info = soup.find_all('div', class_='row infinite-item item paper-card')
-    if not paper_info:
-        break
-    for ppr in paper_info:
-        title = ppr.find('h1').text.strip()
-
-        if "paper" in ppr.find('a')['href']:
-            link = base_url + ppr.find('a')['href']
-        else:
-            link = ppr.find('a')['href']
-        Github_Star = ppr.find('span', class_='badge badge-secondary').text.strip().replace(',', '')
-        pdf_link = ''
-        try:
-            response_link = session.get(link, headers=headers)
-            soup_link = BeautifulSoup(response_link.text, 'html.parser')
-            paper_info_link = soup_link.find_all('div', class_='paper-abstract')
-            pdf_link = paper_info_link[0].find('div', class_='col-md-12').find('a')['href']
-        except:
-            pass
-        if title not in data_list:
-            data_list[title] = {'link': link, 'Github Star': int(Github_Star), 'pdf_link': pdf_link.strip()}
-        else:
-            break_duplicate -= 1
-            if break_duplicate == 0:
-                return data_list
-    offset += 1
-    progress.update(offset)
-print('Data retrieval complete')
-return data_list
-
-def load_cached_data(cache_file):
-    if os.path.exists(cache_file):
-        with open(cache_file, 'r') as f:
-            return json.load(f)
-    return None
-
-def save_cached_data(data, cache_file):
-    with open(cache_file, 'w') as f:
-        json.dump(data, f)
-
-def format_dataframe(data):
-    df = pd.DataFrame(data).T
-    df['title'] = df.index
-    df = df[['title', 'Github Star', 'link', 'pdf_link']]
-    # Sort the dataframe by 'Github Star' in descending order
-    df = df.sort_values(by='Github Star', ascending=False)
-    df['link'] = df['link'].apply(lambda x: f'<a href="{x}" target="_blank">Link</a>')
-    df['pdf_link'] = df['pdf_link'].apply(lambda x: f'<a href="{x}" target="_blank">{x}</a>')
-    return df
-
-def load_and_cache_data(url, cache_file):
-    cached_data = load_cached_data(cache_file)
-
-    if cached_data:
-        print(f"Loading cached data from {cache_file}")
-        return cached_data
-
-    print(f"Fetching new data from {url}")
-    new_data = get_rank_papers(url)
-    save_cached_data(new_data, cache_file)
-    return new_data
-
-def update_display(category):
-    cache_file = f"{category}_papers_cache.json"
-    url = f"https://paperswithcode.com/{category}" if category != "top" else "https://paperswithcode.com/"
-
-    data = load_and_cache_data(url, cache_file)
-    df = format_dataframe(data)
-
-    return len(df), df.to_html(escape=False, index=False)
-
-def load_all_data():
-    top_count, top_html = update_display("top")
-    new_count, new_html = update_display("latest")
-    greatest_count, greatest_html = update_display("greatest")
-    return top_count, top_html, new_count, new_html, greatest_count, greatest_html
-
-def safe_filename(title):
-    """Convert a string to a safe filename."""
-    return re.sub(r'[^\w\-_\. ]', '_', title)
+# ... (keep all the previous functions up to create_date_directory)

-def create_date_directory():
-    """Create a directory named with the current date."""
-    date_str = datetime.now().strftime("%Y-%m-%d")
-    os.makedirs(date_str, exist_ok=True)
-    return date_str
-
-async def download_and_save_pdf(session, title, paper_info, directory):
+async def download_and_process_pdf(session, title, paper_info, directory):
    pdf_url = paper_info['pdf_link']
    if not pdf_url:
-        return f"No PDF link available for: {title}", None
+        return f"No PDF link available for: {title}", None, None

    try:
        timeout = aiohttp.ClientTimeout(total=60) # 60 seconds timeout
        async with session.get(pdf_url, timeout=timeout) as response:
            if response.status != 200:
-                return f"Failed to download PDF for {title}: HTTP {response.status}", None
+                return f"Failed to download PDF for {title}: HTTP {response.status}", None, None
            pdf_content = await response.read()

-
-
+        file_length = len(pdf_content)
+        if file_length < 5000: # Check if the PDF is less than 5KB
+            return f"Downloaded PDF for {title} is too small ({file_length} bytes). Skipping.", None, None
+
+        # Convert PDF to text
+        pdf_file = io.BytesIO(pdf_content)
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+        text = ""
+        for page in pdf_reader.pages:
+            text += page.extract_text()
+
+        if len(text) < 5000: # Check if the extracted text is less than 5KB
+            return f"Extracted text for {title} is too small ({len(text)} characters). Skipping.", None, None

        safe_title = safe_filename(title)
-
-
+        txt_filename = f"{safe_title}.txt"
+        txt_filepath = os.path.join(directory, txt_filename)

-        async with aiofiles.open(
-        await f.write(
+        async with aiofiles.open(txt_filepath, 'w', encoding='utf-8') as f:
+            await f.write(text)

-        return f"Successfully
+        return f"Successfully processed: {txt_filename} (File length: {file_length} bytes)", txt_filepath, text
    except asyncio.TimeoutError:
-        return f"Timeout while downloading PDF for {title}", None
+        return f"Timeout while downloading PDF for {title}", None, None
    except Exception as e:
-        return f"Error
+        return f"Error processing PDF for {title}: {str(e)}", None, None

async def process_papers(data, directory, progress=gr.Progress()):
    async with aiohttp.ClientSession() as session:
        tasks = []
        for title, paper_info in data.items():
-            task = asyncio.ensure_future(download_and_save_pdf(session, title, paper_info, directory))
+            task = asyncio.ensure_future(download_and_process_pdf(session, title, paper_info, directory))
            tasks.append(task)

        results = []
        successful_downloads = []
        errors = []
        for i, task in enumerate(asyncio.as_completed(tasks), start=1):
-            result, filepath = await task
+            result, filepath, text = await task
            results.append(result)
-            if filepath:
-                successful_downloads.append(filepath)
+            if filepath and text:
+                successful_downloads.append((filepath, text))
            else:
                errors.append(result)
            progress(i / len(tasks), f"Processed {i}/{len(tasks)} papers")

        return results, successful_downloads, errors

-def zip_directory(files_to_zip, directory):
-    """
-
-
-    for file in files_to_zip:
-        zipf.write(file, os.path.relpath(file, os.path.join(directory, '..')))
-    return zip_filename
-
-def get_base64_download_link(file_path):
-    """Create a base64 download link for a file."""
-    with open(file_path, "rb") as file:
-        content = file.read()
-    b64 = base64.b64encode(content).decode()
-    return f'<a href="data:application/zip;base64,{b64}" download="{os.path.basename(file_path)}">Download {os.path.basename(file_path)}</a>'
-
-def get_existing_zip_links():
-    """Get download links for existing zip files."""
-    links = []
-    for file in os.listdir('.'):
-        if file.endswith('.zip') and os.path.isfile(file):
-            links.append(get_base64_download_link(file))
-    return "<br>".join(links)
+def get_base64_download_link(content, filename):
+    """Create a base64 download link for text content."""
+    b64 = base64.b64encode(content.encode()).decode()
+    return f'<a href="data:text/plain;base64,{b64}" download="{filename}">Download {filename}</a>'

def download_all_papers(progress=gr.Progress()):
    all_data = {}
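
Note on the hunk above: the new download_and_process_pdf converts the fetched PDF bytes to plain text with PyPDF2 before writing a .txt file, but the diff does not show `import io` or `import PyPDF2` being added, so those are assumed to exist elsewhere near the top of app.py. A minimal, standalone sketch of that conversion step (assuming PyPDF2 3.x; the helper name pdf_bytes_to_text and the local file "example.pdf" are illustrative, not from app.py):

# Standalone sketch of the PDF-to-text step used in download_and_process_pdf.
# Assumes PyPDF2 3.x is installed; "example.pdf" is a hypothetical local file.
import io
import PyPDF2

def pdf_bytes_to_text(pdf_content: bytes) -> str:
    """Extract plain text from in-memory PDF bytes, page by page."""
    reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
    return "".join(page.extract_text() or "" for page in reader.pages)

if __name__ == "__main__":
    with open("example.pdf", "rb") as f:
        print(pdf_bytes_to_text(f.read())[:500])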
@@ -204,20 +93,19 @@ def download_all_papers(progress=gr.Progress()):
    date_directory = create_date_directory()
    results, successful_downloads, errors = asyncio.run(process_papers(all_data, date_directory, progress))

-    if successful_downloads:
-        zip_file = zip_directory(successful_downloads, date_directory)
-        download_link = get_base64_download_link(zip_file)
-    else:
-        download_link = "No papers were successfully downloaded."
-
-    existing_links = get_existing_zip_links()
-
    summary = f"Papers processed: {len(all_data)}\n"
-    summary += f"Successfully downloaded: {len(successful_downloads)}\n"
+    summary += f"Successfully downloaded and converted: {len(successful_downloads)}\n"
    summary += f"Errors: {len(errors)}\n\n"
    summary += "Error List:\n" + "\n".join(errors) if errors else "No errors occurred."

-
+    download_links = []
+    text_contents = []
+    for filepath, text in successful_downloads:
+        filename = os.path.basename(filepath)
+        download_links.append(get_base64_download_link(text, filename))
+        text_contents.append(f"--- {filename} ---\n\n{text[:1000]}...\n\n")  # Show first 1000 characters
+
+    return summary, "<br>".join(download_links), "\n".join(text_contents)

with gr.Blocks() as demo:
    gr.Markdown("<h1><center>Papers Leaderboard</center></h1>")
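
With this hunk, download_all_papers returns three values: the status summary, an HTML string of per-paper download links, and a concatenated text preview. The links come from the get_base64_download_link(content, filename) helper added in the first hunk; a small sketch of how such a data-URI link behaves (no zip file or server-side file is needed; the demo string and filename below are illustrative):

# Sketch of the data-URI download link produced by get_base64_download_link.
import base64

def get_base64_download_link(content, filename):
    """Same helper as in the diff: embed text in an <a download> data URI."""
    b64 = base64.b64encode(content.encode()).decode()
    return f'<a href="data:text/plain;base64,{b64}" download="{filename}">Download {filename}</a>'

print(get_base64_download_link("hello world", "demo.txt"))
# <a href="data:text/plain;base64,aGVsbG8gd29ybGQ=" download="demo.txt">Download demo.txt</a>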
@@ -243,7 +131,8 @@ with gr.Blocks() as demo:
        download_button = gr.Button("📚 Download All Papers", variant="primary")
        download_output = gr.Textbox(label="Download Status")
        download_links = gr.HTML(label="Download Links")
-
+        text_output = gr.Code(label="Paper Contents", language="text")
+        download_button.click(fn=download_all_papers, inputs=None, outputs=[download_output, download_links, text_output])

    # Load initial data for all tabs
    demo.load(fn=load_all_data, outputs=[top_count, top_html, new_count, new_html, greatest_count, greatest_html])
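
The final hunk wires the button to the new three-output signature: the tuple returned by download_all_papers maps positionally onto outputs=[download_output, download_links, text_output]. A minimal Gradio sketch of the same wiring pattern, with a toy callback and illustrative component names that are not from app.py:

# Minimal wiring sketch: return order must match the outputs= list.
import gradio as gr

def do_work():
    # Three values, in the same order as the outputs below.
    return "status text", "<em>links html</em>", "preview text"

with gr.Blocks() as demo:
    btn = gr.Button("Run")
    status = gr.Textbox(label="Status")
    links = gr.HTML(label="Links")
    preview = gr.Code(label="Preview")
    btn.click(fn=do_work, inputs=None, outputs=[status, links, preview])

# demo.launch()  # uncomment to run locally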