Update app.py
app.py
CHANGED
@@ -7,121 +7,61 @@ import os
 import json
 import PyPDF2
 import io
+import markdown
+import asyncio
+import aiohttp
+import aiofiles
+from concurrent.futures import ThreadPoolExecutor

-def get_rank_papers(…):
-    base_url = "https://paperswithcode.com"
-    session = requests.Session()
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
-        'Cache-Control': 'no-cache'
-    }
-    print("Time run at : ", time.ctime())
-    offset = 0
-    data_list = {}
-    break_duplicate = 10
-
-    while True:
-        response = session.get(url, headers=headers, params={'page': offset})
-        if response.status_code != 200:
-            print('Failed to retrieve data')
-            break
-        soup = BeautifulSoup(response.text, 'html.parser')
-        paper_info = soup.find_all('div', class_='row infinite-item item paper-card')
-        if not paper_info:
-            break
-        for ppr in paper_info:
-            title = ppr.find('h1').text.strip()
-
-            if "paper" in ppr.find('a')['href']:
-                link = base_url + ppr.find('a')['href']
-            else:
-                link = ppr.find('a')['href']
-            Github_Star = ppr.find('span', class_='badge badge-secondary').text.strip().replace(',', '')
-            pdf_link = ''
-            try:
-                response_link = session.get(link, headers=headers)
-                soup_link = BeautifulSoup(response_link.text, 'html.parser')
-                paper_info_link = soup_link.find_all('div', class_='paper-abstract')
-                pdf_link = paper_info_link[0].find('div', class_='col-md-12').find('a')['href']
-            except:
-                pass
-            if title not in data_list:
-                data_list[title] = {'link': link, 'Github Star': int(Github_Star), 'pdf_link': pdf_link.strip()}
-            else:
-                break_duplicate -= 1
-                if break_duplicate == 0:
-                    return data_list
-        offset += 1
-        progress.update(offset)
-    print('Data retrieval complete')
-    return data_list
-
-def load_cached_data(cache_file):
-    if os.path.exists(cache_file):
-        with open(cache_file, 'r') as f:
-            return json.load(f)
-    return None
-
-def save_cached_data(data, cache_file):
-    with open(cache_file, 'w') as f:
-        json.dump(data, f)
-
-def format_dataframe(data):
-    df = pd.DataFrame(data).T
-    df['title'] = df.index
-    df = df[['title', 'Github Star', 'link', 'pdf_link']]
-    df['link'] = df['link'].apply(lambda x: f'<a href="{x}" target="_blank">Link</a>')
-    df['pdf_link'] = df['pdf_link'].apply(lambda x: f'<a href="{x}" target="_blank">{x}</a>')
-    return df
+# ... (keep the existing functions like get_rank_papers, load_cached_data, save_cached_data, format_dataframe, load_and_cache_data, update_display, load_all_data)

-def load_and_cache_data(…):
-    …
+async def download_and_convert_pdf(session, title, paper_info):
+    pdf_url = paper_info['pdf_link']
+    cache_file = f"cache/{title.replace(' ', '_')}.md"

-    if …:
-        …
-    print(f"Fetching new data from {url}")
-    new_data = get_rank_papers(url)
-    save_cached_data(new_data, cache_file)
-    return new_data
-
-def update_display(category):
-    cache_file = f"{category}_papers_cache.json"
-    url = f"https://paperswithcode.com/{category}" if category != "top" else "https://paperswithcode.com/"
+    if os.path.exists(cache_file):
+        async with aiofiles.open(cache_file, 'r') as f:
+            return await f.read()

-    …
+    if not pdf_url:
+        return f"# {title}\n\nNo PDF link available.\n\n---\n\n"

-    …
+    try:
+        async with session.get(pdf_url) as response:
+            pdf_content = await response.read()
+
+        pdf_file = io.BytesIO(pdf_content)
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+        text = ""
+        for page in pdf_reader.pages:
+            text += page.extract_text()
+
+        markdown_text = f"# {title}\n\n{text}\n\n---\n\n"
+
+        os.makedirs('cache', exist_ok=True)
+        async with aiofiles.open(cache_file, 'w') as f:
+            await f.write(markdown_text)
+
+        return markdown_text
+    except Exception as e:
+        return f"# {title}\n\nError processing PDF: {str(e)}\n\n---\n\n"

-def …
-    …
-            markdown_text = f"# {title}\n\n{text}\n\n---\n\n"
-            consolidated_text += markdown_text
-        except Exception as e:
-            print(f"Error processing PDF for {title}: {str(e)}")
+async def process_papers(data, progress=gr.Progress()):
+    async with aiohttp.ClientSession() as session:
+        tasks = []
+        for title, paper_info in data.items():
+            task = asyncio.ensure_future(download_and_convert_pdf(session, title, paper_info))
+            tasks.append(task)
+
+        consolidated_text = ""
+        for i, task in enumerate(asyncio.as_completed(tasks), start=1):
+            markdown_text = await task
+            consolidated_text += markdown_text
+            progress(i / len(tasks), f"Processed {i}/{len(tasks)} papers")

     return consolidated_text

-def download_all_papers():
+def download_all_papers(progress=gr.Progress()):
     all_data = {}
     for category in ["top", "latest", "greatest"]:
         cache_file = f"{category}_papers_cache.json"
@@ -129,7 +69,7 @@ def download_all_papers():
         if data:
             all_data.update(data)

-    consolidated_text = …
+    consolidated_text = asyncio.run(process_papers(all_data, progress))

     with open("consolidated_papers.md", "w", encoding="utf-8") as f:
         f.write(consolidated_text)
@@ -159,7 +99,8 @@ with gr.Blocks() as demo:

     download_button = gr.Button("📚 Download All Papers", variant="primary")
     download_output = gr.Textbox(label="Download Status")
-    …
+    markdown_output = gr.Markdown(label="Paper Content")
+    download_button.click(fn=download_all_papers, inputs=None, outputs=[download_output, markdown_output])

     # Load initial data for all tabs
     demo.load(fn=load_all_data, outputs=[top_count, top_html, new_count, new_html, greatest_count, greatest_html])
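The new process_papers schedules one download_and_convert_pdf task per paper and then drains them with asyncio.as_completed, advancing the Gradio progress callback as each PDF finishes. A minimal, self-contained sketch of that pattern follows; fake_convert is a stub standing in for download_and_convert_pdf and a plain callable stands in for gr.Progress, so none of the app's scraping or PDF code is needed to run it.

import asyncio

async def fake_convert(title: str) -> str:
    # Stub standing in for download_and_convert_pdf: pretend to fetch and convert one PDF.
    await asyncio.sleep(0.1)
    return f"# {title}\n\n(converted text)\n\n---\n\n"

async def gather_with_progress(titles, progress):
    # Same shape as process_papers: schedule every task up front, then consume
    # them in completion order so the progress callback advances incrementally.
    tasks = [asyncio.ensure_future(fake_convert(t)) for t in titles]
    consolidated = ""
    for i, task in enumerate(asyncio.as_completed(tasks), start=1):
        consolidated += await task
        progress(i / len(tasks), f"Processed {i}/{len(tasks)} papers")
    return consolidated

if __name__ == "__main__":
    text = asyncio.run(gather_with_progress(
        ["Paper A", "Paper B", "Paper C"],
        progress=lambda frac, desc: print(f"{frac:.0%} {desc}"),
    ))
    print(f"{len(text)} characters of consolidated markdown")

In the app itself, download_all_papers drives the real coroutine the same way via asyncio.run(process_papers(all_data, progress)), with the gr.Progress object supplied by Gradio as the callback.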