File size: 1,361 Bytes
fe5e03c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from src.backend.data_fetching.data_fields import DataFields


def _element_text(element) -> str:
    """Return all text nested inside *element*, or "" when *element* is None."""
    if element is None:
        return ""
    return "".join(element.itertext())


def _section_body(section) -> str:
    """Flatten one ``<sec>`` element into a single bracket-free string."""
    import re

    # One chunk per non-title descendant: its own text followed by its tail.
    chunks = [
        "".join(part for part in (node.text, node.tail) if part)
        for node in section.xpath(".//*[name() != 'title']")
    ]
    # Remove bracketed spans (e.g. "[12]") while the chunks are still joined,
    # so a bracket that straddles several chunks is removed as a whole.
    cleaned = re.sub(r"\[[^\]]*\]", "", "\n".join(chunks))
    # Chunk boundaries (and newlines inside the text) become "." separators.
    return cleaned.replace("\n", ".")


def fetch_from_pmcid(pmcid: str, timeout: float = 30.0) -> dict:
    """Download an article's full-text XML from Europe PMC and split it into sections.

    Args:
        pmcid: Identifier inserted into the Europe PMC ``fullTextXML`` URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict keyed by ``DataFields`` members:

        * ``DataFields.ARTICLE_ID`` - the article id, prefixed with "PMC".
        * ``DataFields.SECTION_NAMES`` - "Abstract" followed by one title per
          ``<sec>`` element ("" when a section has no title).
        * ``DataFields.SECTIONS`` - the abstract text followed by the
          flattened text of each ``<sec>`` element, in the same order as the
          names.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If the service does not answer within *timeout*.
        lxml.etree.XMLSyntaxError: If the response body is not well-formed XML.
    """
    import requests
    from lxml import etree

    base_url = "https://www.ebi.ac.uk/europepmc/webservices/rest/{}/fullTextXML"
    # Without a timeout requests waits forever on a stalled connection.
    resp = requests.get(base_url.format(pmcid), timeout=timeout)
    # Fail on HTTP errors here instead of handing an error page to the parser.
    resp.raise_for_status()
    article_xml = etree.fromstring(resp.content)

    # Fall back to the requested id when the document carries no pmcid element.
    id_element = article_xml.find(".//article-id[@pub-id-type='pmcid']")
    raw_id = (id_element.text if id_element is not None else None) or pmcid
    # NOTE(review): the element text may or may not already include the "PMC"
    # prefix - confirm against the feed; add it only when it is missing.
    article_id = raw_id if raw_id.startswith("PMC") else f"PMC{raw_id}"

    # Only the first child element of the first <abstract> is used.
    article_abstract = _element_text(article_xml.find(".//abstract/*"))

    # findall(".//sec") also yields nested sections, each as its own entry.
    section_elements = article_xml.findall(".//sec")

    # itertext() keeps heading text that sits behind inline markup, and a
    # section without any <title> yields "" so names stay aligned with bodies.
    section_names = ["Abstract"] + [
        _element_text(section.find(".//title")) for section in section_elements
    ]
    sections = [article_abstract] + [
        _section_body(section) for section in section_elements
    ]

    return {
        DataFields.ARTICLE_ID: article_id,
        DataFields.SECTION_NAMES: section_names,
        DataFields.SECTIONS: sections,
    }