import logging
import re
from typing import Tuple

import arxiv
import gradio as gr
import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


def clean_spaces(text: str) -> str:
    """Normalize whitespace: collapse runs of spaces, squeeze blank lines, and strip."""
    # collapse runs of spaces into a single space
    text = re.sub(r" +", " ", text)
    # reduce runs of more than two newlines to exactly two
    text = re.sub(r"\n\n+", "\n\n", text)
    # remove leading and trailing whitespace
    return text.strip()


def get_cleaned_arxiv_paper_text(html_content: str) -> str:
    """Extract and clean the main article text from an arXiv HTML page."""
    # parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    # drop the conversion-alert banner (one div with classes "package-alerts" and
    # "ltx_document"), if present, so it does not leak into the extracted text
    alerts = soup.find("div", class_="package-alerts ltx_document")
    if alerts is not None:
        alerts.decompose()
    # get the "article" html element and extract its text
    article = soup.find("article")
    if article is None:
        raise gr.Error("Could not find an <article> element in the fetched HTML content")
    article_text = article.get_text()
    # clean up the whitespace
    return clean_spaces(article_text)


def load_text_from_arxiv(arxiv_id: str, abstract_only: bool = False) -> Tuple[str, str]:
    """Fetch a paper from arXiv and return a (cleaned text, source URL) tuple.

    If abstract_only is True, the abstract and the abstract-page URL are returned;
    otherwise the HTML rendering of the full paper is downloaded and cleaned.
    """
    search_by_id = arxiv.Search(id_list=[arxiv_id])
    try:
        results = list(arxiv.Client().results(search_by_id))
    except arxiv.HTTPError as e:
        raise gr.Error(f"Failed to fetch arXiv data: {e}") from e
    if len(results) == 0:
        raise gr.Error(f"Could not find any paper with arXiv ID '{arxiv_id}'")
    first_result = results[0]
    if abstract_only:
        abstract_clean = first_result.summary.replace("\n", " ")
        return abstract_clean, first_result.entry_id
    if "/abs/" not in first_result.entry_id:
        raise gr.Error(
            f"Could not create the HTML URL for arXiv ID '{arxiv_id}' because its entry ID has "
            f"an unexpected format: {first_result.entry_id}"
        )
    html_url = first_result.entry_id.replace("/abs/", "/html/")
    # use a timeout so a stalled connection cannot hang the request indefinitely
    request_result = requests.get(html_url, timeout=30)
    if request_result.status_code != 200:
        raise gr.Error(
            f"Could not fetch the HTML content for arXiv ID '{arxiv_id}', status code: "
            f"{request_result.status_code}"
        )
    html_content = request_result.text
    text_clean = get_cleaned_arxiv_paper_text(html_content)
    return text_clean, html_url
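

# Minimal usage sketch (illustrative, not part of the module's API; the arXiv ID
# below is just an example and can be replaced with any valid ID).
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    example_id = "1706.03762"  # "Attention Is All You Need"

    # abstract only: available for any paper
    abstract, abs_url = load_text_from_arxiv(example_id, abstract_only=True)
    logger.info("abstract from %s: %s...", abs_url, abstract[:80])

    # full text: requires that arXiv provides an HTML rendering of the paper,
    # which is only the case for relatively recent submissions
    full_text, source_url = load_text_from_arxiv(example_id)
    logger.info("loaded %d characters from %s", len(full_text), source_url)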