awacke1 committed
Commit f00ae4c · verified · 1 Parent(s): 5be96e5

Update app.py

Files changed (1)
  1. app.py +49 -108
app.py CHANGED
@@ -7,121 +7,61 @@ import os
 import json
 import PyPDF2
 import io
+import markdown
+import asyncio
+import aiohttp
+import aiofiles
+from concurrent.futures import ThreadPoolExecutor
 
-def get_rank_papers(url, progress=gr.Progress(track_tqdm=True)):
-    base_url = "https://paperswithcode.com"
-    session = requests.Session()
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
-        'Cache-Control': 'no-cache'
-    }
-    print("Time run at : ", time.ctime())
-    offset = 0
-    data_list = {}
-    break_duplicate = 10
-
-    while True:
-        response = session.get(url, headers=headers, params={'page': offset})
-        if response.status_code != 200:
-            print('Failed to retrieve data')
-            break
-        soup = BeautifulSoup(response.text, 'html.parser')
-        paper_info = soup.find_all('div', class_='row infinite-item item paper-card')
-        if not paper_info:
-            break
-        for ppr in paper_info:
-            title = ppr.find('h1').text.strip()
-
-            if "paper" in ppr.find('a')['href']:
-                link = base_url + ppr.find('a')['href']
-            else:
-                link = ppr.find('a')['href']
-            Github_Star = ppr.find('span', class_='badge badge-secondary').text.strip().replace(',', '')
-            pdf_link = ''
-            try:
-                response_link = session.get(link, headers=headers)
-                soup_link = BeautifulSoup(response_link.text, 'html.parser')
-                paper_info_link = soup_link.find_all('div', class_='paper-abstract')
-                pdf_link = paper_info_link[0].find('div', class_='col-md-12').find('a')['href']
-            except:
-                pass
-            if title not in data_list:
-                data_list[title] = {'link': link, 'Github Star': int(Github_Star), 'pdf_link': pdf_link.strip()}
-            else:
-                break_duplicate -= 1
-                if break_duplicate == 0:
-                    return data_list
-        offset += 1
-        progress.update(offset)
-    print('Data retrieval complete')
-    return data_list
-
-def load_cached_data(cache_file):
-    if os.path.exists(cache_file):
-        with open(cache_file, 'r') as f:
-            return json.load(f)
-    return None
-
-def save_cached_data(data, cache_file):
-    with open(cache_file, 'w') as f:
-        json.dump(data, f)
-
-def format_dataframe(data):
-    df = pd.DataFrame(data).T
-    df['title'] = df.index
-    df = df[['title', 'Github Star', 'link', 'pdf_link']]
-    df['link'] = df['link'].apply(lambda x: f'<a href="{x}" target="_blank">Link</a>')
-    df['pdf_link'] = df['pdf_link'].apply(lambda x: f'<a href="{x}" target="_blank">{x}</a>')
-    return df
-
-def load_and_cache_data(url, cache_file):
-    cached_data = load_cached_data(cache_file)
-
-    if cached_data:
-        print(f"Loading cached data from {cache_file}")
-        return cached_data
-
-    print(f"Fetching new data from {url}")
-    new_data = get_rank_papers(url)
-    save_cached_data(new_data, cache_file)
-    return new_data
-
-def update_display(category):
-    cache_file = f"{category}_papers_cache.json"
-    url = f"https://paperswithcode.com/{category}" if category != "top" else "https://paperswithcode.com/"
-
-    data = load_and_cache_data(url, cache_file)
-    df = format_dataframe(data)
-
-    return len(df), df.to_html(escape=False, index=False)
-
-def load_all_data():
-    top_count, top_html = update_display("top")
-    new_count, new_html = update_display("latest")
-    greatest_count, greatest_html = update_display("greatest")
-    return top_count, top_html, new_count, new_html, greatest_count, greatest_html
+# ... (keep the existing functions like get_rank_papers, load_cached_data, save_cached_data, format_dataframe, load_and_cache_data, update_display, load_all_data)
+
+async def download_and_convert_pdf(session, title, paper_info):
+    pdf_url = paper_info['pdf_link']
+    cache_file = f"cache/{title.replace(' ', '_')}.md"
+
+    if os.path.exists(cache_file):
+        async with aiofiles.open(cache_file, 'r') as f:
+            return await f.read()
 
-def download_and_convert_pdfs(data):
-    consolidated_text = ""
-    for title, paper_info in data.items():
-        pdf_url = paper_info['pdf_link']
-        if pdf_url:
-            try:
-                response = requests.get(pdf_url)
-                pdf_file = io.BytesIO(response.content)
-                pdf_reader = PyPDF2.PdfReader(pdf_file)
-                text = ""
-                for page in pdf_reader.pages:
-                    text += page.extract_text()
-
-                markdown_text = f"# {title}\n\n{text}\n\n---\n\n"
-                consolidated_text += markdown_text
-            except Exception as e:
-                print(f"Error processing PDF for {title}: {str(e)}")
+    if not pdf_url:
+        return f"# {title}\n\nNo PDF link available.\n\n---\n\n"
+
+    try:
+        async with session.get(pdf_url) as response:
+            pdf_content = await response.read()
+
+        pdf_file = io.BytesIO(pdf_content)
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+        text = ""
+        for page in pdf_reader.pages:
+            text += page.extract_text()
+
+        markdown_text = f"# {title}\n\n{text}\n\n---\n\n"
+
+        os.makedirs('cache', exist_ok=True)
+        async with aiofiles.open(cache_file, 'w') as f:
+            await f.write(markdown_text)
+
+        return markdown_text
+    except Exception as e:
+        return f"# {title}\n\nError processing PDF: {str(e)}\n\n---\n\n"
+
+async def process_papers(data, progress=gr.Progress()):
+    async with aiohttp.ClientSession() as session:
+        tasks = []
+        for title, paper_info in data.items():
+            task = asyncio.ensure_future(download_and_convert_pdf(session, title, paper_info))
+            tasks.append(task)
+
+        consolidated_text = ""
+        for i, task in enumerate(asyncio.as_completed(tasks), start=1):
+            markdown_text = await task
+            consolidated_text += markdown_text
+            progress(i / len(tasks), f"Processed {i}/{len(tasks)} papers")
 
     return consolidated_text
 
-def download_all_papers():
+def download_all_papers(progress=gr.Progress()):
     all_data = {}
     for category in ["top", "latest", "greatest"]:
         cache_file = f"{category}_papers_cache.json"
@@ -129,7 +69,7 @@ def download_all_papers():
         if data:
             all_data.update(data)
 
-    consolidated_text = download_and_convert_pdfs(all_data)
+    consolidated_text = asyncio.run(process_papers(all_data, progress))
 
     with open("consolidated_papers.md", "w", encoding="utf-8") as f:
         f.write(consolidated_text)
@@ -159,7 +99,8 @@ with gr.Blocks() as demo:
 
     download_button = gr.Button("📚 Download All Papers", variant="primary")
    download_output = gr.Textbox(label="Download Status")
-    download_button.click(fn=download_all_papers, inputs=None, outputs=download_output)
+    markdown_output = gr.Markdown(label="Paper Content")
+    download_button.click(fn=download_all_papers, inputs=None, outputs=[download_output, markdown_output])
 
     # Load initial data for all tabs
     demo.load(fn=load_all_data, outputs=[top_count, top_html, new_count, new_html, greatest_count, greatest_html])
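For readers skimming the diff, here is a minimal, self-contained sketch of the pattern the new code adopts: download several PDFs concurrently with aiohttp, extract their text with PyPDF2, and drive the coroutine from synchronous code via asyncio.run(). The function names, title, and URL below are illustrative placeholders, not code or data from this repository.

# Minimal sketch (placeholder names and URL) of the aiohttp + PyPDF2 +
# asyncio.run pattern introduced in this commit.
import asyncio
import io

import aiohttp
import PyPDF2


async def fetch_pdf_markdown(session, title, url):
    # Download one PDF and convert it to a small Markdown section.
    try:
        async with session.get(url) as response:
            pdf_bytes = await response.read()
        reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
        text = "".join(page.extract_text() or "" for page in reader.pages)
        return f"# {title}\n\n{text}\n\n---\n\n"
    except Exception as exc:
        return f"# {title}\n\nError processing PDF: {exc}\n\n---\n\n"


async def gather_papers(papers):
    # papers maps titles to PDF URLs; all downloads run concurrently.
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_pdf_markdown(session, t, u) for t, u in papers.items()]
        chunks = await asyncio.gather(*tasks)
    return "".join(chunks)


if __name__ == "__main__":
    sample = {"Example Paper": "https://example.com/paper.pdf"}  # placeholder URL
    print(asyncio.run(gather_papers(sample)))

Note that asyncio.gather preserves input order, whereas the committed process_papers uses asyncio.as_completed so it can report progress as each paper finishes, at the cost of output order.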