Spaces:

Yozhikoff
/

paper-topic-classification

Runtime error

Yozhikoff committed on Apr 15, 2023

Commit

84837a0

1 Parent(s): e993597

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -10,32 +10,33 @@ from transformers import pipeline
 classifier = pipeline(model="Yozhikoff/arxiv-topics-distilbert-base-cased")
 def get_arxiv_title_and_abstract(link):
-    # Regular expression pattern for arXiv link validation
     try:
-        pattern = r'^https?://arxiv\.org/(?:abs|pdf)/(\d{4}\.\d{4,5})(?:\.pdf)?/?$'
         match = re.match(pattern, link)
         if not match:
-            raise ValueError("Invalid arXiv link")
-        # Construct the arXiv API URL for the paper
-        arxiv_id = match.group(1)
-        api_url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}"
-        # Retrieve the paper metadata using the arXiv API
-        with urllib.request.urlopen(api_url) as response:
-            xml_data = response.read().decode()
-        # Extract the title and abstract from the XML data
-        title = re.search(r'<title>(.*?)</title>', xml_data).group(1)
-        abstract = re.search(r'<summary>(.*?)</summary>', xml_data, re.DOTALL).group(1)
-        # Clean up the title and abstract
-        title = re.sub(r'\s+', ' ', title).strip()
-        abstract = re.sub(r'\s+', ' ', abstract).strip()
-        return title, abstract
     except:
         raise gr.Error('Invalid arXiv URL!')

 classifier = pipeline(model="Yozhikoff/arxiv-topics-distilbert-base-cased")
+import re
+import urllib.request
+import xml.etree.ElementTree as ET
 def get_arxiv_title_and_abstract(link):
     try:
+        # Validate the arxiv link
+        pattern = r'^https?://arxiv.org/(abs|pdf)/(\d{4}\.\d{4,5})(\.pdf)?$'
         match = re.match(pattern, link)
         if not match:
+            raise ValueError('Invalid arxiv link')
+        # Construct the arxiv API URL
+        arxiv_id = match.group(2)
+        api_url = f'http://export.arxiv.org/api/query?id_list={arxiv_id}'
+        # Send a request to the arxiv API
+        response = urllib.request.urlopen(api_url)
+        xml_data = response.read()
+        # Parse the XML data
+        root = ET.fromstring(xml_data)
+        entry = root.find('{http://www.w3.org/2005/Atom}entry')
+        title = entry.find('{http://www.w3.org/2005/Atom}title').text
+        summary = entry.find('{http://www.w3.org/2005/Atom}summary').text
+        return title, summary
     except:
         raise gr.Error('Invalid arXiv URL!')