import json
import os
import uuid
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Optional
import requests
from bs4 import BeautifulSoup
from .grobid_client import GrobidClient
from .grobid_util import extract_paper_metadata_from_grobid_xml, parse_bibliography
from .s2orc_paper import Paper
from .utils import (
_clean_empty_and_duplicate_authors_from_grobid_parse,
check_if_citations_are_bracket_style,
extract_abstract_from_tei_xml,
extract_back_matter_from_tei_xml,
extract_body_text_from_tei_xml,
extract_figures_and_tables_from_tei_xml,
normalize_grobid_id,
sub_all_note_tags,
)
BASE_TEMP_DIR = "./grobid/temp"
BASE_OUTPUT_DIR = "./grobid/output"
BASE_LOG_DIR = "./grobid/log"
def convert_tei_xml_soup_to_s2orc_json(soup: BeautifulSoup, paper_id: str, pdf_hash: str) -> Paper:
"""
    Convert Grobid TEI XML to S2ORC JSON format
    :param soup: BeautifulSoup of the TEI XML file content
    :param paper_id: name of the input file, used as the paper id
    :param pdf_hash: hash of the source PDF
    :return: Paper object
"""
# extract metadata
metadata = extract_paper_metadata_from_grobid_xml(soup.fileDesc)
# clean metadata authors (remove dupes etc)
metadata["authors"] = _clean_empty_and_duplicate_authors_from_grobid_parse(metadata["authors"])
# parse bibliography entries (removes empty bib entries)
biblio_entries = parse_bibliography(soup)
bibkey_map = {normalize_grobid_id(bib["ref_id"]): bib for bib in biblio_entries}
# # process formulas and replace with text
# extract_formulas_from_tei_xml(soup)
# extract figure and table captions
refkey_map = extract_figures_and_tables_from_tei_xml(soup)
# get bracket style
is_bracket_style = check_if_citations_are_bracket_style(soup)
# substitute all note tags with p tags
soup = sub_all_note_tags(soup)
# process abstract if possible
abstract_entries = extract_abstract_from_tei_xml(
soup, bibkey_map, refkey_map, is_bracket_style
)
# process body text
body_entries = extract_body_text_from_tei_xml(soup, bibkey_map, refkey_map, is_bracket_style)
# parse back matter (acks, author statements, competing interests, abbrevs etc)
back_matter = extract_back_matter_from_tei_xml(soup, bibkey_map, refkey_map, is_bracket_style)
# form final paper entry
return Paper(
paper_id=paper_id,
pdf_hash=pdf_hash,
metadata=metadata,
abstract=abstract_entries,
body_text=body_entries,
back_matter=back_matter,
bib_entries=bibkey_map,
ref_entries=refkey_map,
)
def convert_tei_xml_file_to_s2orc_json(tei_file: str, pdf_hash: str = "") -> Paper:
"""
    Convert a TEI XML file to S2ORC JSON
    :param tei_file: path to the TEI XML file
    :param pdf_hash: hash of the source PDF, if known
    :return: Paper object
"""
if not os.path.exists(tei_file):
raise FileNotFoundError("Input TEI XML file doesn't exist")
    paper_id = os.path.basename(tei_file).split(".")[0]
    with open(tei_file, "rb") as f:
        soup = BeautifulSoup(f.read(), "xml")
paper = convert_tei_xml_soup_to_s2orc_json(soup, paper_id, pdf_hash)
return paper
def process_pdf_stream(
input_file: str, sha: str, input_stream: bytes, grobid_config: Optional[Dict] = None
) -> Dict:
"""
    Process a PDF byte stream and get its S2ORC JSON representation
    :param input_file: name of the input file, used as the paper id
    :param sha: hash of the PDF
    :param input_stream: raw PDF bytes
    :param grobid_config: optional Grobid client configuration
    :return: S2ORC JSON dict
"""
# process PDF through Grobid -> TEI.XML
client = GrobidClient(grobid_config)
tei_text = client.process_pdf_stream(
input_file, input_stream, "temp", "processFulltextDocument"
)
# make soup
soup = BeautifulSoup(tei_text, "xml")
# get paper
paper = convert_tei_xml_soup_to_s2orc_json(soup, input_file, sha)
return paper.release_json("pdf")
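# Illustrative usage of process_pdf_stream (a sketch, assuming a running
# Grobid server reachable via the default GrobidClient configuration; the
# file name only serves as the paper id):
#   with open("paper.pdf", "rb") as f:
#       s2orc_dict = process_pdf_stream("paper.pdf", sha="", input_stream=f.read())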
def process_pdf_file(
input_file: str,
temp_dir: str = BASE_TEMP_DIR,
output_dir: str = BASE_OUTPUT_DIR,
grobid_config: Optional[Dict] = None,
verbose: bool = True,
) -> str:
"""
    Process a PDF file and get its JSON representation
    :param input_file: path to the input PDF
    :param temp_dir: directory for intermediate TEI XML files
    :param output_dir: directory for the output JSON files
    :param grobid_config: optional Grobid client configuration
    :param verbose: whether to print status messages
    :return: path to the output JSON file
"""
os.makedirs(temp_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)
    # get paper id as the file name without its extension
    paper_id = os.path.splitext(os.path.basename(input_file))[0]
tei_file = os.path.join(temp_dir, f"{paper_id}.tei.xml")
output_file = os.path.join(output_dir, f"{paper_id}.json")
# check if input file exists and output file doesn't
if not os.path.exists(input_file):
raise FileNotFoundError(f"{input_file} doesn't exist")
if os.path.exists(output_file):
if verbose:
print(f"{output_file} already exists!")
return output_file
# process PDF through Grobid -> TEI.XML
client = GrobidClient(grobid_config)
# TODO: compute PDF hash
# TODO: add grobid version number to output
client.process_pdf(input_file, temp_dir, "processFulltextDocument")
# process TEI.XML -> JSON
    assert os.path.exists(tei_file), f"Grobid did not produce {tei_file}"
paper = convert_tei_xml_file_to_s2orc_json(tei_file)
# write to file
with open(output_file, "w") as outf:
json.dump(paper.release_json(), outf, indent=4, sort_keys=False)
return output_file
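# Illustrative usage of process_pdf_file (a sketch, assuming a running Grobid
# server with the default configuration; paths are hypothetical):
#   out_path = process_pdf_file("./pdfs/paper.pdf")
#   # -> "./grobid/output/paper.json"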
UUID_NAMESPACE = uuid.UUID("bab08d37-ac12-40c4-847a-20ca337742fd")
def paper_url_to_uuid(paper_url: str) -> uuid.UUID:
return uuid.uuid5(UUID_NAMESPACE, paper_url)
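# Note: uuid5 hashing is deterministic, so the same paper URL always maps to
# the same UUID across calls and processes, e.g. (hypothetical URL):
#   paper_url_to_uuid("https://example.org/paper.pdf")
# returns the same value every time, which makes it usable as a stable
# document identifier.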
@dataclass
class PDFDownloader:
verbose: bool = True
def download(self, url: str, opath: str | Path) -> Path:
"""Download a pdf file from URL and save locally.
Skip if there is a file at `opath` already.
Parameters
----------
url : str
URL of the target PDF file
opath : str
Path to save downloaded PDF data.
"""
        opath = Path(opath)
        if opath.exists():
            return opath
        opath.parent.mkdir(parents=True, exist_ok=True)
        if self.verbose:
            print(f"Downloading {url} into {opath}")
        res = requests.get(url)
        # fail early on HTTP errors instead of writing an error page as a PDF
        res.raise_for_status()
        with open(opath, "wb") as f:
            f.write(res.content)
        return opath
@dataclass
class FulltextExtractor:
def __call__(self, pdf_file_path: Path | str) -> tuple[str, dict] | None:
"""Extract plain text from a PDf file"""
raise NotImplementedError
@dataclass
class GrobidFulltextExtractor(FulltextExtractor):
tmp_dir: str = "./tmp/grobid"
grobid_config: Optional[Dict] = None
    section_separator: str = "\n\n"
    paragraph_separator: str = "\n"
verbose: bool = True
def construct_plain_text(self, extraction_result: dict) -> str:
section_strings = []
# add the title, if available (consider it as the first section)
title = extraction_result.get("title")
if title and title.strip():
section_strings.append(title.strip())
section_paragraphs: dict[str, list[str]] = extraction_result["sections"]
section_strings.extend(
            self.paragraph_separator.join(
# consider the section title as the first paragraph and
# remove empty paragraphs
filter(lambda s: len(s) > 0, map(lambda s: s.strip(), [section_name] + paragraphs))
)
for section_name, paragraphs in section_paragraphs.items()
)
        return self.section_separator.join(section_strings)
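    # Illustrative example of the assembly above (hypothetical input): with
    # the default separators,
    #   {"title": "A Paper", "sections": {"Intro": ["p1", "p2"]}}
    # yields "A Paper\n\nIntro\np1\np2", i.e. sections are separated by blank
    # lines and each section starts with its name as the first line.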
    def postprocess_extraction_result(self, extraction_result: dict) -> dict:
        # group body-text paragraphs by their section name, preserving order
        sections: dict[str, list[str]] = {}
        for body_text in extraction_result["pdf_parse"]["body_text"]:
            sections.setdefault(body_text["section"], []).append(body_text["text"])
        return {**extraction_result, "sections": sections}
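    # For example (hypothetical "pdf_parse" payload), two body_text entries
    # that share the section "Intro" with texts "p1" and "p2" are grouped as
    #   {"Intro": ["p1", "p2"]}
    # under the new "sections" key; everything else is passed through.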
def __call__(self, pdf_file_path: Path | str) -> tuple[str, dict] | None:
"""Extract plain text from a PDf file"""
try:
extraction_fpath = process_pdf_file(
str(pdf_file_path),
temp_dir=self.tmp_dir,
output_dir=self.tmp_dir,
grobid_config=self.grobid_config,
verbose=self.verbose,
)
with open(extraction_fpath, "r") as f:
extraction_result = json.load(f)
processed_extraction_result = self.postprocess_extraction_result(extraction_result)
plain_text = self.construct_plain_text(processed_extraction_result)
return plain_text, extraction_result
        except AssertionError:
            # raised by process_pdf_file when Grobid produces no TEI XML output
            print("Grobid failed to parse this document.")
return None
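# Minimal end-to-end usage sketch (illustrative; assumes a running Grobid
# server reachable with the default GrobidClient configuration, and a
# hypothetical paper URL):
#   downloader = PDFDownloader()
#   pdf_path = downloader.download(
#       "https://example.org/paper.pdf", "./pdfs/paper.pdf"
#   )
#   extractor = GrobidFulltextExtractor(tmp_dir="./tmp/grobid")
#   result = extractor(pdf_path)
#   if result is not None:
#       plain_text, extraction_result = result
#       print(plain_text[:500])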