import json
import os
import uuid
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Optional

import requests
from bs4 import BeautifulSoup

from .grobid_client import GrobidClient
from .grobid_util import extract_paper_metadata_from_grobid_xml, parse_bibliography
from .s2orc_paper import Paper
from .utils import (
    _clean_empty_and_duplicate_authors_from_grobid_parse,
    check_if_citations_are_bracket_style,
    extract_abstract_from_tei_xml,
    extract_back_matter_from_tei_xml,
    extract_body_text_from_tei_xml,
    extract_figures_and_tables_from_tei_xml,
    normalize_grobid_id,
    sub_all_note_tags,
)

BASE_TEMP_DIR = "./grobid/temp"
BASE_OUTPUT_DIR = "./grobid/output"
BASE_LOG_DIR = "./grobid/log"


def convert_tei_xml_soup_to_s2orc_json(soup: BeautifulSoup, paper_id: str, pdf_hash: str) -> Paper:
    """
    Convert Grobid TEI XML to S2ORC JSON format
    :param soup: BeautifulSoup of XML file content
    :param paper_id: name of file, used as the paper id
    :param pdf_hash: hash of the source PDF
    :return: a Paper object with metadata, abstract, body text, back matter,
        bibliography, and figure/table entries
    """
    # Extract paper metadata (title, authors, etc.) from the TEI header
    metadata = extract_paper_metadata_from_grobid_xml(soup.fileDesc)
    metadata["authors"] = _clean_empty_and_duplicate_authors_from_grobid_parse(metadata["authors"])

    # Parse bibliography entries and key them by their normalized Grobid ids
    biblio_entries = parse_bibliography(soup)
    bibkey_map = {normalize_grobid_id(bib["ref_id"]): bib for bib in biblio_entries}

    # Figures and tables, keyed the same way so inline references can resolve
    refkey_map = extract_figures_and_tables_from_tei_xml(soup)

    # Citation style (bracketed "[1]" vs. textual "(Author, Year)") affects
    # how citation spans are matched
    is_bracket_style = check_if_citations_are_bracket_style(soup)

    # Substitute all note tags so they don't interfere with text extraction
    soup = sub_all_note_tags(soup)

    abstract_entries = extract_abstract_from_tei_xml(soup, bibkey_map, refkey_map, is_bracket_style)
    body_entries = extract_body_text_from_tei_xml(soup, bibkey_map, refkey_map, is_bracket_style)
    back_matter = extract_back_matter_from_tei_xml(soup, bibkey_map, refkey_map, is_bracket_style)

    return Paper(
        paper_id=paper_id,
        pdf_hash=pdf_hash,
        metadata=metadata,
        abstract=abstract_entries,
        body_text=body_entries,
        back_matter=back_matter,
        bib_entries=bibkey_map,
        ref_entries=refkey_map,
    )
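

# Example usage (a sketch; assumes `tei_text` holds TEI XML already returned
# by a Grobid server):
#
#     soup = BeautifulSoup(tei_text, "xml")
#     paper = convert_tei_xml_soup_to_s2orc_json(soup, paper_id="paper1", pdf_hash="")
#     s2orc_dict = paper.release_json("pdf")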


def convert_tei_xml_file_to_s2orc_json(tei_file: str, pdf_hash: str = "") -> Paper:
    """
    Convert a TEI XML file to S2ORC JSON
    :param tei_file: path to a TEI XML file produced by Grobid
    :param pdf_hash: hash of the source PDF, if known
    :return: a Paper object
    """
    if not os.path.exists(tei_file):
        raise FileNotFoundError("Input TEI XML file doesn't exist")
    # Paper id is the file name up to its first "."
    paper_id = tei_file.split("/")[-1].split(".")[0]
    with open(tei_file, "rb") as f:
        soup = BeautifulSoup(f.read(), "xml")
    paper = convert_tei_xml_soup_to_s2orc_json(soup, paper_id, pdf_hash)
    return paper
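

# Example usage (a sketch; assumes Grobid has already written this TEI file):
#
#     paper = convert_tei_xml_file_to_s2orc_json("./grobid/temp/paper1.tei.xml")
#     print(json.dumps(paper.release_json(), indent=2))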


def process_pdf_stream(
    input_file: str, sha: str, input_stream: bytes, grobid_config: Optional[Dict] = None
) -> Dict:
    """
    Process a PDF stream through Grobid and convert to S2ORC JSON
    :param input_file: name of the PDF file, used as the paper id
    :param sha: hash of the PDF
    :param input_stream: raw PDF bytes
    :param grobid_config: optional Grobid client configuration
    :return: S2ORC JSON as a dict
    """
    # Process PDF through Grobid -> TEI XML
    client = GrobidClient(grobid_config)
    tei_text = client.process_pdf_stream(
        input_file, input_stream, "temp", "processFulltextDocument"
    )

    soup = BeautifulSoup(tei_text, "xml")
    paper = convert_tei_xml_soup_to_s2orc_json(soup, input_file, sha)
    return paper.release_json("pdf")
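

# Example usage (a sketch; assumes a reachable Grobid server and a local PDF,
# with the PDF's hash passed as `sha` when available):
#
#     with open("paper1.pdf", "rb") as f:
#         pdf_bytes = f.read()
#     s2orc_dict = process_pdf_stream("paper1.pdf", sha="", input_stream=pdf_bytes)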


def process_pdf_file(
    input_file: str,
    temp_dir: str = BASE_TEMP_DIR,
    output_dir: str = BASE_OUTPUT_DIR,
    grobid_config: Optional[Dict] = None,
    verbose: bool = True,
) -> str:
    """
    Process a PDF file and get JSON representation
    :param input_file: path to the PDF to process
    :param temp_dir: directory for intermediate TEI XML files
    :param output_dir: directory for the final JSON output
    :param grobid_config: optional Grobid client configuration
    :param verbose: if True, report when the output file already exists
    :return: path to the output JSON file
    """
    os.makedirs(temp_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    # Paper id is the file name without its final extension
    paper_id = ".".join(input_file.split("/")[-1].split(".")[:-1])
    tei_file = os.path.join(temp_dir, f"{paper_id}.tei.xml")
    output_file = os.path.join(output_dir, f"{paper_id}.json")

    if not os.path.exists(input_file):
        raise FileNotFoundError(f"{input_file} doesn't exist")
    if os.path.exists(output_file):
        if verbose:
            print(f"{output_file} already exists!")
        return output_file

    # Process PDF through Grobid -> TEI XML
    client = GrobidClient(grobid_config)
    client.process_pdf(input_file, temp_dir, "processFulltextDocument")

    # Process TEI XML -> S2ORC JSON
    assert os.path.exists(tei_file), f"Grobid did not produce {tei_file}"
    paper = convert_tei_xml_file_to_s2orc_json(tei_file)

    with open(output_file, "w") as outf:
        json.dump(paper.release_json(), outf, indent=4, sort_keys=False)

    return output_file
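

# Example usage (a sketch; assumes a running Grobid server and a local PDF):
#
#     output_file = process_pdf_file("paper1.pdf")
#     with open(output_file) as f:
#         s2orc_dict = json.load(f)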


# Fixed namespace so the same paper URL always maps to the same UUID
UUID_NAMESPACE = uuid.UUID("bab08d37-ac12-40c4-847a-20ca337742fd")


def paper_url_to_uuid(paper_url: str) -> uuid.UUID:
    """Derive a deterministic (version 5) UUID for a paper from its URL."""
    return uuid.uuid5(UUID_NAMESPACE, paper_url)
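

# Because uuid5 is deterministic within a namespace, repeated calls agree, e.g.:
#
#     a = paper_url_to_uuid("https://example.com/paper1.pdf")
#     b = paper_url_to_uuid("https://example.com/paper1.pdf")
#     assert a == b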


@dataclass
class PDFDownloader:
    verbose: bool = True

    def download(self, url: str, opath: str | Path) -> Path:
        """Download a PDF file from a URL and save it locally.
        Skip if there is a file at `opath` already.

        Parameters
        ----------
        url : str
            URL of the target PDF file
        opath : str | Path
            Path to save downloaded PDF data.
        """
        if os.path.exists(opath):
            return Path(opath)

        Path(opath).parent.mkdir(parents=True, exist_ok=True)

        if self.verbose:
            print(f"Downloading {url} into {opath}")
        # Fetch before opening the output file so a failed request doesn't
        # leave an empty file behind
        res = requests.get(url)
        with open(opath, "wb") as f:
            f.write(res.content)

        return Path(opath)
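

# Example usage (a sketch; the URL and output path are illustrative):
#
#     downloader = PDFDownloader()
#     local_path = downloader.download("https://example.com/paper1.pdf", "./pdfs/paper1.pdf")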


@dataclass
class FulltextExtractor:
    def __call__(self, pdf_file_path: Path | str) -> tuple[str, dict] | None:
        """Extract plain text from a PDF file"""
        raise NotImplementedError


@dataclass
class GrobidFulltextExtractor(FulltextExtractor):
    tmp_dir: str = "./tmp/grobid"
    grobid_config: Optional[Dict] = None
    section_separator: str = "\n\n"
    paragraph_separator: str = "\n"
    verbose: bool = True

    def construct_plain_text(self, extraction_result: dict) -> str:
        section_strings = []

        # Lead with the title, when Grobid found one
        title = extraction_result.get("title")
        if title and title.strip():
            section_strings.append(title.strip())

        # Render each section as its name followed by its paragraphs,
        # dropping empty strings
        section_paragraphs: dict[str, list[str]] = extraction_result["sections"]
        section_strings.extend(
            self.paragraph_separator.join(
                filter(lambda s: len(s) > 0, map(lambda s: s.strip(), [section_name] + paragraphs))
            )
            for section_name, paragraphs in section_paragraphs.items()
        )

        return self.section_separator.join(section_strings)

    def postprocess_extraction_result(self, extraction_result: dict) -> dict:
        # Group body-text paragraphs by the section they belong to,
        # preserving document order
        sections: dict[str, list[str]] = {}
        for body_text in extraction_result["pdf_parse"]["body_text"]:
            section_name = body_text["section"]
            sections.setdefault(section_name, []).append(body_text["text"])

        return {**extraction_result, "sections": sections}

    def __call__(self, pdf_file_path: Path | str) -> tuple[str, dict] | None:
        """Extract plain text from a PDF file"""
        try:
            extraction_fpath = process_pdf_file(
                str(pdf_file_path),
                temp_dir=self.tmp_dir,
                output_dir=self.tmp_dir,
                grobid_config=self.grobid_config,
                verbose=self.verbose,
            )
            with open(extraction_fpath, "r") as f:
                extraction_result = json.load(f)

            processed_extraction_result = self.postprocess_extraction_result(extraction_result)
            plain_text = self.construct_plain_text(processed_extraction_result)
            return plain_text, extraction_result
        except AssertionError:
            # process_pdf_file asserts that Grobid produced a TEI file;
            # treat that failure as "could not parse"
            print("Grobid failed to parse this document.")
            return None
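

# Example usage (a sketch; assumes a running Grobid server and a local PDF):
#
#     extractor = GrobidFulltextExtractor()
#     result = extractor("paper1.pdf")
#     if result is not None:
#         plain_text, raw_extraction = result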