import json
import os
import uuid
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Optional

import requests
from bs4 import BeautifulSoup

from .grobid_client import GrobidClient
from .grobid_util import extract_paper_metadata_from_grobid_xml, parse_bibliography
from .s2orc_paper import Paper
from .utils import (
    _clean_empty_and_duplicate_authors_from_grobid_parse,
    check_if_citations_are_bracket_style,
    extract_abstract_from_tei_xml,
    extract_back_matter_from_tei_xml,
    extract_body_text_from_tei_xml,
    extract_figures_and_tables_from_tei_xml,
    normalize_grobid_id,
    sub_all_note_tags,
)

BASE_TEMP_DIR = "./grobid/temp"
BASE_OUTPUT_DIR = "./grobid/output"
BASE_LOG_DIR = "./grobid/log"


def convert_tei_xml_soup_to_s2orc_json(soup: BeautifulSoup, paper_id: str, pdf_hash: str) -> Paper:
    """
    Convert Grobid TEI XML to S2ORC JSON format
    :param soup: BeautifulSoup of XML file content
    :param paper_id: name of file
    :param pdf_hash: hash of PDF
    :return:
    """
    # extract metadata
    metadata = extract_paper_metadata_from_grobid_xml(soup.fileDesc)
    # clean metadata authors (remove duplicates etc.)
    metadata["authors"] = _clean_empty_and_duplicate_authors_from_grobid_parse(metadata["authors"])

    # parse bibliography entries (removes empty bib entries)
    biblio_entries = parse_bibliography(soup)
    bibkey_map = {normalize_grobid_id(bib["ref_id"]): bib for bib in biblio_entries}

    # # process formulas and replace with text
    # extract_formulas_from_tei_xml(soup)

    # extract figure and table captions
    refkey_map = extract_figures_and_tables_from_tei_xml(soup)

    # get bracket style
    is_bracket_style = check_if_citations_are_bracket_style(soup)

    # substitute all note tags with p tags
    soup = sub_all_note_tags(soup)

    # process abstract if possible
    abstract_entries = extract_abstract_from_tei_xml(
        soup, bibkey_map, refkey_map, is_bracket_style
    )

    # process body text
    body_entries = extract_body_text_from_tei_xml(soup, bibkey_map, refkey_map, is_bracket_style)

    # parse back matter (acks, author statements, competing interests, abbreviations etc.)
    back_matter = extract_back_matter_from_tei_xml(soup, bibkey_map, refkey_map, is_bracket_style)

    # form final paper entry
    return Paper(
        paper_id=paper_id,
        pdf_hash=pdf_hash,
        metadata=metadata,
        abstract=abstract_entries,
        body_text=body_entries,
        back_matter=back_matter,
        bib_entries=bibkey_map,
        ref_entries=refkey_map,
    )


def convert_tei_xml_file_to_s2orc_json(tei_file: str, pdf_hash: str = "") -> Paper:
    """
    Convert a TEI XML file to S2ORC JSON
    :param tei_file: path to the TEI XML file
    :param pdf_hash: hash of the source PDF (optional)
    :return:
    """
    if not os.path.exists(tei_file):
        raise FileNotFoundError("Input TEI XML file doesn't exist")
    paper_id = tei_file.split("/")[-1].split(".")[0]
    with open(tei_file, "rb") as f:
        soup = BeautifulSoup(f.read(), "xml")
    paper = convert_tei_xml_soup_to_s2orc_json(soup, paper_id, pdf_hash)
    return paper


def process_pdf_stream(
    input_file: str, sha: str, input_stream: bytes, grobid_config: Optional[Dict] = None
) -> Dict:
    """
    Process a PDF stream
    :param input_file: name of the source PDF file
    :param sha: hash of the PDF
    :param input_stream: raw PDF bytes
    :param grobid_config: optional Grobid client configuration
    :return:
    """
    # process PDF through Grobid -> TEI.XML
    client = GrobidClient(grobid_config)
    tei_text = client.process_pdf_stream(
        input_file, input_stream, "temp", "processFulltextDocument"
    )

    # make soup
    soup = BeautifulSoup(tei_text, "xml")

    # get paper
    paper = convert_tei_xml_soup_to_s2orc_json(soup, input_file, sha)

    return paper.release_json("pdf")
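
# Illustrative sketch (not part of the original module): one way the entry points
# above could be combined to parse a PDF that is already in memory. The URL,
# filename, and empty sha below are placeholders, and a running Grobid service
# reachable by GrobidClient is assumed.
def _example_parse_pdf_bytes(url: str = "https://example.org/sample.pdf") -> Dict:
    """Fetch a PDF over HTTP and convert it straight to S2ORC JSON."""
    response = requests.get(url)
    # process_pdf_stream sends the bytes to Grobid and converts the resulting
    # TEI XML via convert_tei_xml_soup_to_s2orc_json
    return process_pdf_stream("sample.pdf", sha="", input_stream=response.content)
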
def process_pdf_file(
    input_file: str,
    temp_dir: str = BASE_TEMP_DIR,
    output_dir: str = BASE_OUTPUT_DIR,
    grobid_config: Optional[Dict] = None,
    verbose: bool = True,
) -> str:
    """
    Process a PDF file and get JSON representation
    :param input_file: path to the PDF file
    :param temp_dir: directory for intermediate TEI XML output
    :param output_dir: directory for the final JSON output
    :param grobid_config: optional Grobid client configuration
    :param verbose: print a message if the output file already exists
    :return: path to the output JSON file
    """
    os.makedirs(temp_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    # get paper id as the name of the file
    paper_id = ".".join(input_file.split("/")[-1].split(".")[:-1])
    tei_file = os.path.join(temp_dir, f"{paper_id}.tei.xml")
    output_file = os.path.join(output_dir, f"{paper_id}.json")

    # check if input file exists and output file doesn't
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"{input_file} doesn't exist")
    if os.path.exists(output_file):
        if verbose:
            print(f"{output_file} already exists!")
        return output_file

    # process PDF through Grobid -> TEI.XML
    client = GrobidClient(grobid_config)
    # TODO: compute PDF hash
    # TODO: add grobid version number to output
    client.process_pdf(input_file, temp_dir, "processFulltextDocument")

    # process TEI.XML -> JSON
    assert os.path.exists(tei_file)
    paper = convert_tei_xml_file_to_s2orc_json(tei_file)

    # write to file
    with open(output_file, "w") as outf:
        json.dump(paper.release_json(), outf, indent=4, sort_keys=False)

    return output_file


UUID_NAMESPACE = uuid.UUID("bab08d37-ac12-40c4-847a-20ca337742fd")


def paper_url_to_uuid(paper_url: str) -> "uuid.UUID":
    return uuid.uuid5(UUID_NAMESPACE, paper_url)


@dataclass
class PDFDownloader:
    verbose: bool = True

    def download(self, url: str, opath: str | Path) -> Path:
        """Download a PDF file from a URL and save it locally.

        Skip if there is a file at `opath` already.

        Parameters
        ----------
        url : str
            URL of the target PDF file
        opath : str | Path
            Path to save the downloaded PDF data.
        """
        if os.path.exists(opath):
            return Path(opath)

        if not os.path.exists(os.path.dirname(opath)):
            os.makedirs(os.path.dirname(opath), exist_ok=True)

        if self.verbose:
            print(f"Downloading {url} into {opath}")
        # fetch the PDF before opening the output file so a failed request
        # does not leave an empty file behind
        res = requests.get(url)
        with open(opath, "wb") as f:
            f.write(res.content)
        return Path(opath)


@dataclass
class FulltextExtractor:
    def __call__(self, pdf_file_path: Path | str) -> tuple[str, dict] | None:
        """Extract plain text from a PDF file"""
        raise NotImplementedError
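
# Illustrative sketch (not part of the original module): PDFDownloader and
# paper_url_to_uuid can be combined so each paper is cached under a stable,
# URL-derived filename. The cache directory below is a placeholder.
def _example_download_with_uuid_name(url: str, cache_dir: str = "./pdf_cache") -> Path:
    """Download `url` to `<cache_dir>/<uuid5-of-url>.pdf`, reusing any existing copy."""
    fname = f"{paper_url_to_uuid(url)}.pdf"
    return PDFDownloader(verbose=False).download(url, Path(cache_dir) / fname)
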
@dataclass
class GrobidFulltextExtractor(FulltextExtractor):
    tmp_dir: str = "./tmp/grobid"
    grobid_config: Optional[Dict] = None
    section_seperator: str = "\n\n"
    paragraph_seperator: str = "\n"
    verbose: bool = True

    def construct_plain_text(self, extraction_result: dict) -> str:
        section_strings = []

        # add the title, if available (consider it as the first section)
        title = extraction_result.get("title")
        if title and title.strip():
            section_strings.append(title.strip())

        section_paragraphs: dict[str, list[str]] = extraction_result["sections"]
        section_strings.extend(
            self.paragraph_seperator.join(
                # consider the section title as the first paragraph and
                # remove empty paragraphs
                filter(lambda s: len(s) > 0, map(lambda s: s.strip(), [section_name] + paragraphs))
            )
            for section_name, paragraphs in section_paragraphs.items()
        )

        return self.section_seperator.join(section_strings)

    def postprocess_extraction_result(self, extraction_result: dict) -> dict:
        # add sections: group body-text paragraphs by their section name
        sections: dict[str, list[str]] = {}
        for body_text in extraction_result["pdf_parse"]["body_text"]:
            section_name = body_text["section"]
            if section_name not in sections:
                sections[section_name] = []
            sections[section_name] += [body_text["text"]]

        extraction_result = {**extraction_result, "sections": sections}
        return extraction_result

    def __call__(self, pdf_file_path: Path | str) -> tuple[str, dict] | None:
        """Extract plain text from a PDF file"""
        try:
            extraction_fpath = process_pdf_file(
                str(pdf_file_path),
                temp_dir=self.tmp_dir,
                output_dir=self.tmp_dir,
                grobid_config=self.grobid_config,
                verbose=self.verbose,
            )
            with open(extraction_fpath, "r") as f:
                extraction_result = json.load(f)

            processed_extraction_result = self.postprocess_extraction_result(extraction_result)
            plain_text = self.construct_plain_text(processed_extraction_result)
            return plain_text, extraction_result
        except AssertionError:
            print("Grobid failed to parse this document.")
            return None
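
# Illustrative sketch (not part of the original module): end-to-end use of the
# classes above. A local Grobid service is assumed; the URL and the ./tmp/pdfs
# directory are placeholders.
def _example_extract_fulltext(url: str = "https://example.org/sample.pdf") -> str | None:
    """Download a PDF and return its plain text, or None if Grobid fails to parse it."""
    pdf_path = PDFDownloader().download(
        url, Path("./tmp/pdfs") / f"{paper_url_to_uuid(url)}.pdf"
    )
    result = GrobidFulltextExtractor()(pdf_path)
    if result is None:
        return None
    plain_text, _extraction_result = result
    return plain_text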