awacke1 committed
Commit f00ae4c · verified · 1 Parent(s): 5be96e5

Update app.py

Files changed (1)
  1. app.py +49 -108
app.py CHANGED
@@ -7,121 +7,61 @@ import os
 import json
 import PyPDF2
 import io
+import markdown
+import asyncio
+import aiohttp
+import aiofiles
+from concurrent.futures import ThreadPoolExecutor
 
-def get_rank_papers(url, progress=gr.Progress(track_tqdm=True)):
-    base_url = "https://paperswithcode.com"
-    session = requests.Session()
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
-        'Cache-Control': 'no-cache'
-    }
-    print("Time run at : ", time.ctime())
-    offset = 0
-    data_list = {}
-    break_duplicate = 10
-
-    while True:
-        response = session.get(url, headers=headers, params={'page': offset})
-        if response.status_code != 200:
-            print('Failed to retrieve data')
-            break
-        soup = BeautifulSoup(response.text, 'html.parser')
-        paper_info = soup.find_all('div', class_='row infinite-item item paper-card')
-        if not paper_info:
-            break
-        for ppr in paper_info:
-            title = ppr.find('h1').text.strip()
-
-            if "paper" in ppr.find('a')['href']:
-                link = base_url + ppr.find('a')['href']
-            else:
-                link = ppr.find('a')['href']
-            Github_Star = ppr.find('span', class_='badge badge-secondary').text.strip().replace(',', '')
-            pdf_link = ''
-            try:
-                response_link = session.get(link, headers=headers)
-                soup_link = BeautifulSoup(response_link.text, 'html.parser')
-                paper_info_link = soup_link.find_all('div', class_='paper-abstract')
-                pdf_link = paper_info_link[0].find('div', class_='col-md-12').find('a')['href']
-            except:
-                pass
-            if title not in data_list:
-                data_list[title] = {'link': link, 'Github Star': int(Github_Star), 'pdf_link': pdf_link.strip()}
-            else:
-                break_duplicate -= 1
-                if break_duplicate == 0:
-                    return data_list
-        offset += 1
-        progress.update(offset)
-    print('Data retrieval complete')
-    return data_list
-
-def load_cached_data(cache_file):
-    if os.path.exists(cache_file):
-        with open(cache_file, 'r') as f:
-            return json.load(f)
-    return None
-
-def save_cached_data(data, cache_file):
-    with open(cache_file, 'w') as f:
-        json.dump(data, f)
-
-def format_dataframe(data):
-    df = pd.DataFrame(data).T
-    df['title'] = df.index
-    df = df[['title', 'Github Star', 'link', 'pdf_link']]
-    df['link'] = df['link'].apply(lambda x: f'<a href="{x}" target="_blank">Link</a>')
-    df['pdf_link'] = df['pdf_link'].apply(lambda x: f'<a href="{x}" target="_blank">{x}</a>')
-    return df
-
-def load_and_cache_data(url, cache_file):
-    cached_data = load_cached_data(cache_file)
-
-    if cached_data:
-        print(f"Loading cached data from {cache_file}")
-        return cached_data
-
-    print(f"Fetching new data from {url}")
-    new_data = get_rank_papers(url)
-    save_cached_data(new_data, cache_file)
-    return new_data
-
-def update_display(category):
-    cache_file = f"{category}_papers_cache.json"
-    url = f"https://paperswithcode.com/{category}" if category != "top" else "https://paperswithcode.com/"
-
-    data = load_and_cache_data(url, cache_file)
-    df = format_dataframe(data)
-
-    return len(df), df.to_html(escape=False, index=False)
-
-def load_all_data():
-    top_count, top_html = update_display("top")
-    new_count, new_html = update_display("latest")
-    greatest_count, greatest_html = update_display("greatest")
-    return top_count, top_html, new_count, new_html, greatest_count, greatest_html
+# ... (keep the existing functions like get_rank_papers, load_cached_data, save_cached_data, format_dataframe, load_and_cache_data, update_display, load_all_data)
+
+async def download_and_convert_pdf(session, title, paper_info):
+    pdf_url = paper_info['pdf_link']
+    cache_file = f"cache/{title.replace(' ', '_')}.md"
+
+    if os.path.exists(cache_file):
+        async with aiofiles.open(cache_file, 'r') as f:
+            return await f.read()
 
-def download_and_convert_pdfs(data):
-    consolidated_text = ""
-    for title, paper_info in data.items():
-        pdf_url = paper_info['pdf_link']
-        if pdf_url:
-            try:
-                response = requests.get(pdf_url)
-                pdf_file = io.BytesIO(response.content)
-                pdf_reader = PyPDF2.PdfReader(pdf_file)
-                text = ""
-                for page in pdf_reader.pages:
-                    text += page.extract_text()
-
-                markdown_text = f"# {title}\n\n{text}\n\n---\n\n"
-                consolidated_text += markdown_text
-            except Exception as e:
-                print(f"Error processing PDF for {title}: {str(e)}")
+    if not pdf_url:
+        return f"# {title}\n\nNo PDF link available.\n\n---\n\n"
+
+    try:
+        async with session.get(pdf_url) as response:
+            pdf_content = await response.read()
+
+        pdf_file = io.BytesIO(pdf_content)
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+        text = ""
+        for page in pdf_reader.pages:
+            text += page.extract_text()
+
+        markdown_text = f"# {title}\n\n{text}\n\n---\n\n"
+
+        os.makedirs('cache', exist_ok=True)
+        async with aiofiles.open(cache_file, 'w') as f:
+            await f.write(markdown_text)
+
+        return markdown_text
+    except Exception as e:
+        return f"# {title}\n\nError processing PDF: {str(e)}\n\n---\n\n"
+
+async def process_papers(data, progress=gr.Progress()):
+    async with aiohttp.ClientSession() as session:
+        tasks = []
+        for title, paper_info in data.items():
+            task = asyncio.ensure_future(download_and_convert_pdf(session, title, paper_info))
+            tasks.append(task)
+
+        consolidated_text = ""
+        for i, task in enumerate(asyncio.as_completed(tasks), start=1):
+            markdown_text = await task
+            consolidated_text += markdown_text
+            progress(i / len(tasks), f"Processed {i}/{len(tasks)} papers")
 
     return consolidated_text
 
-def download_all_papers():
+def download_all_papers(progress=gr.Progress()):
     all_data = {}
     for category in ["top", "latest", "greatest"]:
         cache_file = f"{category}_papers_cache.json"
@@ -129,7 +69,7 @@ def download_all_papers():
         if data:
             all_data.update(data)
 
-    consolidated_text = download_and_convert_pdfs(all_data)
+    consolidated_text = asyncio.run(process_papers(all_data, progress))
 
     with open("consolidated_papers.md", "w", encoding="utf-8") as f:
         f.write(consolidated_text)
@@ -159,7 +99,8 @@ with gr.Blocks() as demo:
 
     download_button = gr.Button("📚 Download All Papers", variant="primary")
    download_output = gr.Textbox(label="Download Status")
-    download_button.click(fn=download_all_papers, inputs=None, outputs=download_output)
+    markdown_output = gr.Markdown(label="Paper Content")
+    download_button.click(fn=download_all_papers, inputs=None, outputs=[download_output, markdown_output])
 
     # Load initial data for all tabs
     demo.load(fn=load_all_data, outputs=[top_count, top_html, new_count, new_html, greatest_count, greatest_html])
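For readers skimming the diff, here is a minimal, self-contained sketch of the pattern the new code adopts: download several PDFs concurrently with aiohttp, extract their text with PyPDF2, and drive the coroutine from synchronous code via asyncio.run(). The function names, title, and URL below are illustrative placeholders, not code or data from this repository.

# Minimal sketch (placeholder names and URL) of the aiohttp + PyPDF2 +
# asyncio.run pattern introduced in this commit.
import asyncio
import io

import aiohttp
import PyPDF2


async def fetch_pdf_markdown(session, title, url):
    # Download one PDF and convert it to a small Markdown section.
    try:
        async with session.get(url) as response:
            pdf_bytes = await response.read()
        reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
        text = "".join(page.extract_text() or "" for page in reader.pages)
        return f"# {title}\n\n{text}\n\n---\n\n"
    except Exception as exc:
        return f"# {title}\n\nError processing PDF: {exc}\n\n---\n\n"


async def gather_papers(papers):
    # papers maps titles to PDF URLs; all downloads run concurrently.
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_pdf_markdown(session, t, u) for t, u in papers.items()]
        chunks = await asyncio.gather(*tasks)
    return "".join(chunks)


if __name__ == "__main__":
    sample = {"Example Paper": "https://example.com/paper.pdf"}  # placeholder URL
    print(asyncio.run(gather_papers(sample)))

Note that asyncio.gather preserves input order, whereas the committed process_papers uses asyncio.as_completed so it can report progress as each paper finishes, at the cost of output order.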