Yozhikoff committed on
Commit
84837a0
·
1 Parent(s): e993597

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -22
app.py CHANGED
@@ -10,32 +10,33 @@ from transformers import pipeline
10
  classifier = pipeline(model="Yozhikoff/arxiv-topics-distilbert-base-cased")
11
 
12
 
 
 
 
 
13
  def get_arxiv_title_and_abstract(link):
14
- # Regular expression pattern for arXiv link validation
15
  try:
16
- pattern = r'^https?://arxiv\.org/(?:abs|pdf)/(\d{4}\.\d{4,5})(?:\.pdf)?/?$'
 
17
  match = re.match(pattern, link)
18
-
19
  if not match:
20
- raise ValueError("Invalid arXiv link")
21
-
22
- # Construct the arXiv API URL for the paper
23
- arxiv_id = match.group(1)
24
- api_url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}"
25
-
26
- # Retrieve the paper metadata using the arXiv API
27
- with urllib.request.urlopen(api_url) as response:
28
- xml_data = response.read().decode()
29
-
30
- # Extract the title and abstract from the XML data
31
- title = re.search(r'<title>(.*?)</title>', xml_data).group(1)
32
- abstract = re.search(r'<summary>(.*?)</summary>', xml_data, re.DOTALL).group(1)
33
-
34
- # Clean up the title and abstract
35
- title = re.sub(r'\s+', ' ', title).strip()
36
- abstract = re.sub(r'\s+', ' ', abstract).strip()
37
-
38
- return title, abstract
39
  except:
40
  raise gr.Error('Invalid arXiv URL!')
41
 
 
10
  classifier = pipeline(model="Yozhikoff/arxiv-topics-distilbert-base-cased")
11
 
12
 
13
+ import re
14
+ import urllib.request
15
+ import xml.etree.ElementTree as ET
16
+
17
  def get_arxiv_title_and_abstract(link):
 
18
  try:
19
+ # Validate the arxiv link
20
+ pattern = r'^https?://arxiv.org/(abs|pdf)/(\d{4}\.\d{4,5})(\.pdf)?$'
21
  match = re.match(pattern, link)
 
22
  if not match:
23
+ raise ValueError('Invalid arxiv link')
24
+
25
+ # Construct the arxiv API URL
26
+ arxiv_id = match.group(2)
27
+ api_url = f'http://export.arxiv.org/api/query?id_list={arxiv_id}'
28
+
29
+ # Send a request to the arxiv API
30
+ response = urllib.request.urlopen(api_url)
31
+ xml_data = response.read()
32
+
33
+ # Parse the XML data
34
+ root = ET.fromstring(xml_data)
35
+ entry = root.find('{http://www.w3.org/2005/Atom}entry')
36
+ title = entry.find('{http://www.w3.org/2005/Atom}title').text
37
+ summary = entry.find('{http://www.w3.org/2005/Atom}summary').text
38
+
39
+ return title, summary
 
 
40
  except:
41
  raise gr.Error('Invalid arXiv URL!')
42