import re
from collections import defaultdict
from typing import Dict, List, Optional, Union
import bs4
from bs4 import BeautifulSoup
SUBSTITUTE_TAGS = {"persName", "orgName", "publicationStmt", "titleStmt", "biblScope"}

def clean_tags(el: bs4.element.Tag):
    """
    Rename every tag listed in SUBSTITUTE_TAGS to its lowercase form, in place.

    :param el: element whose matching descendant tags are renamed
    """
    for sub_tag in SUBSTITUTE_TAGS:
        for sub_el in el.find_all(sub_tag):
            sub_el.name = sub_tag.lower()
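
# A minimal usage sketch for clean_tags (the XML snippet is a hypothetical example):
#
#   soup = BeautifulSoup("<author><persName/></author>", "xml")
#   clean_tags(soup)
#   assert soup.persname is not None  # <persName> was renamed to <persname>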

def soup_from_path(file_path: str) -> BeautifulSoup:
    """
    Read an XML file into a BeautifulSoup object.

    :param file_path: path to the XML file
    :return: parsed soup
    """
    # use a context manager so the file handle is closed after reading
    with open(file_path, "rb") as f:
        return BeautifulSoup(f.read(), "xml")

def get_title_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Returns the title, preferring <title level="a"> (the article-level title).

    :param raw_xml: parsed GROBID XML
    :return: title text, or "" if no title is present
    """
    for title_entry in raw_xml.find_all("title"):
        if title_entry.has_attr("level") and title_entry["level"] == "a":
            return title_entry.text
    try:
        return raw_xml.title.text
    except AttributeError:
        return ""

def get_author_names_from_grobid_xml(
    raw_xml: BeautifulSoup,
) -> List[Dict[str, Union[str, List[str]]]]:
    """
    Returns a list of dictionaries, one for each author,
    containing the parts of the name, e.g.
    {
        "first": first,
        "middle": middle,
        "last": last,
        "suffix": suffix
    }
    """
    names = []
    for author in raw_xml.find_all("author"):
        if not author.persname:
            continue
        # forenames include first and middle names
        forenames = author.persname.find_all("forename")
        # surnames include last names
        surnames = author.persname.find_all("surname")
        # name suffixes
        suffixes = author.persname.find_all("suffix")
        first = ""
        middle = []
        last = ""
        suffix = ""
        for forename in forenames:
            if not forename.has_attr("type"):
                continue
            if forename["type"] == "first":
                if not first:
                    first = forename.text
                else:
                    middle.append(forename.text)
            elif forename["type"] == "middle":
                middle.append(forename.text)
        # when several surnames are present, treat all but the last as middle names
        if len(surnames) > 1:
            for surname in surnames[:-1]:
                middle.append(surname.text)
            last = surnames[-1].text
        elif len(surnames) == 1:
            last = surnames[0].text
        if len(suffixes) >= 1:
            suffix = " ".join([suff.text for suff in suffixes])
        names_dict: Dict[str, Union[str, List[str]]] = {
            "first": first,
            "middle": middle,
            "last": last,
            "suffix": suffix,
        }
        names.append(names_dict)
    return names
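
# Illustrative output of get_author_names_from_grobid_xml for an author with two
# forenames and one surname (hypothetical values):
#
#   [{"first": "Ada", "middle": ["King"], "last": "Lovelace", "suffix": ""}]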

def get_affiliation_from_grobid_xml(raw_xml: BeautifulSoup) -> Dict:
    """
    Get the affiliation (laboratory, institution, and address) from GROBID XML.

    :param raw_xml: parsed GROBID XML
    :return: affiliation dict, or {} if neither a laboratory nor an institution is named
    """
    location_dict = dict()
    laboratory_name = ""
    institution_name = ""
    if raw_xml and raw_xml.affiliation:
        for child in raw_xml.affiliation:
            if child.name == "orgname":
                if child.has_attr("type"):
                    if child["type"] == "laboratory":
                        laboratory_name = child.text
                    elif child["type"] == "institution":
                        institution_name = child.text
            elif child.name == "address":
                for grandchild in child:
                    if grandchild.name and grandchild.text:
                        location_dict[grandchild.name] = grandchild.text
    if laboratory_name or institution_name:
        return {
            "laboratory": laboratory_name,
            "institution": institution_name,
            "location": location_dict,
        }
    return {}

def get_author_data_from_grobid_xml(raw_xml: BeautifulSoup) -> List[Dict]:
    """
    Returns a list of dictionaries, one for each author,
    containing the name parts, affiliation, and email, e.g.
    {
        "first": first,
        "middle": middle,
        "last": last,
        "suffix": suffix,
        "affiliation": {
            "laboratory": "",
            "institution": "",
            "location": "",
        },
        "email": ""
    }
    """
    authors = []
    for author in raw_xml.find_all("author"):
        first = ""
        middle = []
        last = ""
        suffix = ""
        if author.persname:
            # forenames include first and middle names
            forenames = author.persname.find_all("forename")
            # surnames include last names
            surnames = author.persname.find_all("surname")
            # name suffixes
            suffixes = author.persname.find_all("suffix")
            for forename in forenames:
                if forename.has_attr("type"):
                    if forename["type"] == "first":
                        if not first:
                            first = forename.text
                        else:
                            middle.append(forename.text)
                    elif forename["type"] == "middle":
                        middle.append(forename.text)
            # when several surnames are present, treat all but the last as middle names
            if len(surnames) > 1:
                for surname in surnames[:-1]:
                    middle.append(surname.text)
                last = surnames[-1].text
            elif len(surnames) == 1:
                last = surnames[0].text
            if len(suffixes) >= 1:
                suffix = " ".join([suff.text for suff in suffixes])
        affiliation = get_affiliation_from_grobid_xml(author)
        email = ""
        if author.email:
            email = author.email.text
        author_dict = {
            "first": first,
            "middle": middle,
            "last": last,
            "suffix": suffix,
            "affiliation": affiliation,
            "email": email,
        }
        authors.append(author_dict)
    return authors

def get_year_from_grobid_xml(raw_xml: BeautifulSoup) -> Optional[int]:
    """
    Returns the publication year if one can be extracted.

    :param raw_xml: parsed GROBID XML
    :return: four-digit year as int, or None
    """
    if raw_xml.date and raw_xml.date.has_attr("when"):
        # match a year at the start of the "when" attribute (an unspecified date format)
        year_match = re.match(r"((19|20)\d{2})", raw_xml.date["when"])
        if year_match:
            year = year_match.group(0)
            if year and year.isnumeric() and len(year) == 4:
                return int(year)
    return None

def get_venue_from_grobid_xml(raw_xml: BeautifulSoup, title_text: str) -> str:
    """
    Returns the venue/journal/publisher of a bib entry.
    GROBID ref documentation: https://grobid.readthedocs.io/en/latest/training/Bibliographical-references/
    level="j": journal title
    level="m": "non journal bibliographical item holding the cited article"
    level="s": series title

    :return: venue name, or "" if none is found
    """
    title_names = []
    keep_types = ["j", "m", "s"]
    # get all titles of the above types, skipping the entry's own title
    for title_entry in raw_xml.find_all("title"):
        if (
            title_entry.has_attr("level")
            and title_entry["level"] in keep_types
            and title_entry.text != title_text
        ):
            title_names.append((title_entry["level"], title_entry.text))
    # return the title that most likely names the journal or publication venue
    if title_names:
        title_names.sort(key=lambda x: keep_types.index(x[0]))
        return title_names[0][1]
    return ""

def get_volume_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Returns the volume number of a GROBID bib entry,
    taken from <biblscope unit="volume">.

    :return: volume, or "" if none is found
    """
    for bibl_entry in raw_xml.find_all("biblscope"):
        if bibl_entry.has_attr("unit") and bibl_entry["unit"] == "volume":
            return bibl_entry.text
    return ""

def get_issue_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Returns the issue number of a GROBID bib entry,
    taken from <biblscope unit="issue">.

    :return: issue, or "" if none is found
    """
    for bibl_entry in raw_xml.find_all("biblscope"):
        if bibl_entry.has_attr("unit") and bibl_entry["unit"] == "issue":
            return bibl_entry.text
    return ""

def get_pages_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Returns the page numbers of a GROBID bib entry,
    taken from <biblscope unit="page">.

    :return: "from--to" page range, a single start page, or "" if none is found
    """
    for bibl_entry in raw_xml.find_all("biblscope"):
        if (
            bibl_entry.has_attr("unit")
            and bibl_entry["unit"] == "page"
            and bibl_entry.has_attr("from")
        ):
            from_page = bibl_entry["from"]
            if bibl_entry.has_attr("to"):
                to_page = bibl_entry["to"]
                return f"{from_page}--{to_page}"
            else:
                return from_page
    return ""

def get_other_ids_from_grobid_xml(raw_xml: BeautifulSoup) -> Dict[str, List]:
    """
    Returns a dictionary of other identifiers (e.g. arxiv, pubmed, doi)
    from a GROBID bib entry.

    :param raw_xml: parsed GROBID XML
    :return: mapping from identifier type to a list of values
    """
    other_ids = defaultdict(list)
    for idno_entry in raw_xml.find_all("idno"):
        if idno_entry.has_attr("type") and idno_entry.text:
            other_ids[idno_entry["type"]].append(idno_entry.text)
    return other_ids
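
# Illustrative output of get_other_ids_from_grobid_xml for an entry carrying a DOI
# and an arXiv id (hypothetical type names and values):
#
#   {"DOI": ["10.1000/xyz123"], "arXiv": ["2101.00001"]}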

def get_raw_bib_text_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Returns the raw bibliography string.

    :param raw_xml: parsed GROBID XML
    :return: raw reference text, or "" if none is found
    """
    for note in raw_xml.find_all("note"):
        if note.has_attr("type") and note["type"] == "raw_reference":
            return note.text
    return ""

def get_publication_datetime_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Finds and returns the publication datetime if it exists.

    :param raw_xml: parsed GROBID XML
    :return: the "when" attribute of the published date, or ""
    """
    # clean_tags() lowercases <publicationStmt>, so look up the lowercase name
    if raw_xml.publicationstmt:
        for child in raw_xml.publicationstmt:
            if (
                child.name == "date"
                and child.has_attr("type")
                and child["type"] == "published"
                and child.has_attr("when")
            ):
                return child["when"]
    return ""

def parse_bib_entry(bib_entry: BeautifulSoup) -> Dict:
    """
    Parse one bib entry into a structured dictionary.

    :param bib_entry: a <biblStruct> element
    :return: structured bib entry
    """
    clean_tags(bib_entry)
    title = get_title_from_grobid_xml(bib_entry)
    return {
        "ref_id": bib_entry.attrs.get("xml:id", None),
        "title": title,
        "authors": get_author_names_from_grobid_xml(bib_entry),
        "year": get_year_from_grobid_xml(bib_entry),
        "venue": get_venue_from_grobid_xml(bib_entry, title),
        "volume": get_volume_from_grobid_xml(bib_entry),
        "issue": get_issue_from_grobid_xml(bib_entry),
        "pages": get_pages_from_grobid_xml(bib_entry),
        "other_ids": get_other_ids_from_grobid_xml(bib_entry),
        "raw_text": get_raw_bib_text_from_grobid_xml(bib_entry),
        "urls": [],
    }

def is_reference_tag(tag: bs4.element.Tag) -> bool:
    return tag.name == "ref" and tag.attrs.get("type", "") == "bibr"
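
# Usage sketch: bs4's find_all accepts a predicate, so in-text citation markers can
# be collected with (assuming `soup` is an already-parsed GROBID document):
#
#   citation_markers = soup.find_all(is_reference_tag)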

def extract_paper_metadata_from_grobid_xml(tag: bs4.element.Tag) -> Dict:
    """
    Extract paper metadata (title, authors with affiliations, publication date)
    from the GROBID header XML.

    :param tag: the header element of the GROBID XML
    :return: metadata dict
    """
    clean_tags(tag)
    paper_metadata = {
        "title": tag.titlestmt.title.text,
        "authors": get_author_data_from_grobid_xml(tag),
        "year": get_publication_datetime_from_grobid_xml(tag),
    }
    return paper_metadata

def parse_bibliography(soup: BeautifulSoup) -> List[Dict]:
    """
    Finds all bibliography entries in a GROBID XML and parses them.
    """
    bibliography = soup.listBibl
    if bibliography is None:
        return []
    entries = bibliography.find_all("biblStruct")
    structured_entries = []
    for entry in entries:
        bib_entry = parse_bib_entry(entry)
        # add the bib entry only if it has a title
        if bib_entry["title"]:
            structured_entries.append(bib_entry)
    # remove the bibliography from the soup once it has been parsed
    bibliography.decompose()
    return structured_entries
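
if __name__ == "__main__":
    # Smoke-test sketch on a hand-written snippet that mimics GROBID TEI output.
    # The XML below is a hypothetical, stripped-down example, not real GROBID output.
    example_xml = """
    <TEI>
      <listBibl>
        <biblStruct xml:id="b0">
          <analytic>
            <title level="a">An Example Paper</title>
            <author>
              <persName>
                <forename type="first">Ada</forename>
                <surname>Lovelace</surname>
              </persName>
            </author>
          </analytic>
          <monogr>
            <title level="j">Journal of Examples</title>
            <imprint>
              <biblScope unit="volume">42</biblScope>
              <biblScope unit="page" from="1" to="10"/>
              <date type="published" when="2020-01-01"/>
            </imprint>
          </monogr>
        </biblStruct>
      </listBibl>
    </TEI>
    """
    soup = BeautifulSoup(example_xml, "xml")
    for entry in parse_bibliography(soup):
        print(entry["ref_id"], entry["title"], entry["year"], entry["venue"], entry["pages"])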