import logging
import re
from typing import Tuple

import arxiv
import gradio as gr
import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


def clean_spaces(text: str) -> str:
    """Normalize whitespace: collapse space runs, cap blank lines, strip ends."""
    # replace all multiple spaces with a single space
    text = re.sub(r" +", " ", text)
    # reduce more than two newlines to two newlines
    text = re.sub(r"\n\n+", "\n\n", text)
    # remove leading and trailing whitespaces
    return text.strip()


def get_cleaned_arxiv_paper_text(html_content: str) -> str:
    """Extract the <article> text from an arXiv HTML page and normalize its whitespace.

    Raises gr.Error if the page contains no <article> element.
    """
    # parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    # get the "article" html element (the main paper content)
    article = soup.find("article")
    if article is None:
        # fix: the original called .get_text() unconditionally and crashed with
        # an AttributeError when the page had no <article> element
        raise gr.Error("Could not find an <article> element in the HTML content")
    # cleanup the text
    return clean_spaces(article.get_text())


def load_text_from_arxiv(arxiv_id: str, abstract_only: bool = False) -> Tuple[str, str]:
    """Fetch the full paper text (or just the abstract) for an arXiv ID.

    Args:
        arxiv_id: The arXiv identifier to look up.
        abstract_only: If True, return only the abstract instead of the full HTML text.

    Returns:
        A tuple of (cleaned text, source URL).

    Raises:
        gr.Error: If the lookup fails, the ID is unknown, the entry ID has an
            unexpected format, or the HTML page cannot be fetched/parsed.
    """
    search_by_id = arxiv.Search(id_list=[arxiv_id])
    try:
        results = list(arxiv.Client().results(search_by_id))
    except arxiv.HTTPError as e:
        raise gr.Error(f"Failed to fetch arXiv data: {e}")
    if not results:
        raise gr.Error(f"Could not find any paper with arXiv ID '{arxiv_id}'")
    first_result = results[0]

    if abstract_only:
        # the API summary hard-wraps lines; flatten it to a single paragraph
        abstract_clean = first_result.summary.replace("\n", " ")
        return abstract_clean, first_result.entry_id

    # the HTML rendering lives at the same path with /abs/ swapped for /html/
    if "/abs/" not in first_result.entry_id:
        raise gr.Error(
            f"Could not create the HTML URL for arXiv ID '{arxiv_id}' because its entry ID has "
            f"an unexpected format: {first_result.entry_id}"
        )
    html_url = first_result.entry_id.replace("/abs/", "/html/")
    # fix: a timeout prevents the request from hanging indefinitely on a stalled connection
    request_result = requests.get(html_url, timeout=30)
    if request_result.status_code != 200:
        raise gr.Error(
            f"Could not fetch the HTML content for arXiv ID '{arxiv_id}', status code: "
            f"{request_result.status_code}"
        )
    html_content = request_result.text
    text_clean = get_cleaned_arxiv_paper_text(html_content)
    return text_clean, html_url