from typing import Any, Dict

from .s2orc_paper import METADATA_KEYS, Paper


def _normalize_bib_links(bib_entries: Dict[str, Any]) -> Dict[str, Any]:
    """Give every bib entry that has a singular "link" a one-item "links" list.

    Entries are updated in place (the caller's dicts are modified) and the
    same mapping is returned for convenience.
    """
    for entry in bib_entries.values():
        if "link" in entry:
            entry["links"] = [entry["link"]]
    return bib_entries


def load_s2orc(paper_dict: Dict[str, Any]) -> Paper:
    """
    Load release S2ORC into Paper class

    Two input layouts are recognised:

    * 2019 gorc parses: the parse lives under ``"grobid_parse"`` and the
      metadata under the top-level ``"metadata"`` key.
    * current and 2020 s2orc release_json: the parse lives under
      ``"pdf_parse"``, or directly at the top level when ``"body_text"`` is
      present there. Metadata is read from the same dict as the parse; when
      it is absent or empty a placeholder is used.

    Bib entries carrying a ``"link"`` key are given a ``"links"`` list in
    place, so the entries inside ``paper_dict`` are modified.

    :param paper_dict: one S2ORC record; must contain ``"paper_id"``
    :return: a ``Paper`` built from the record's parse and metadata
    :raises KeyError: if ``"paper_id"`` is missing, or if a 2019 gorc record
        has no ``"metadata"`` key
    :raises NotImplementedError: if none of the known layouts matches
    """
    paper_id = paper_dict["paper_id"]
    pdf_hash = paper_dict.get("_pdf_hash", paper_dict.get("s2_pdf_hash", None))

    # 2019 gorc parses
    grobid_parse = paper_dict.get("grobid_parse")
    if grobid_parse:
        parse = grobid_parse
        metadata = {
            k: v for k, v in paper_dict["metadata"].items() if k in METADATA_KEYS
        }
    # current and 2020 s2orc release_json
    elif paper_dict.get("pdf_parse") or paper_dict.get("body_text"):
        # Only descend into "pdf_parse" when it actually holds a parse. A
        # record with an empty/None "pdf_parse" but a top-level "body_text"
        # must be read from the top level instead of failing on None.
        parse = paper_dict.get("pdf_parse") or paper_dict
        raw_metadata = parse.get("metadata")
        if raw_metadata:
            metadata = {k: v for k, v in raw_metadata.items() if k in METADATA_KEYS}
        # 2020 s2orc releases (metadata is separate)
        else:
            metadata = {"title": None, "authors": [], "year": None}
    else:
        # Carry the id in the exception itself rather than printing it, so
        # the failing record is identifiable wherever the error is reported.
        raise NotImplementedError(
            f"Unknown S2ORC file type! (paper_id={paper_id})"
        )

    return Paper(
        paper_id=paper_id,
        pdf_hash=pdf_hash,
        metadata=metadata,
        abstract=parse.get("abstract", []),
        body_text=parse.get("body_text", []),
        back_matter=parse.get("back_matter", []),
        bib_entries=_normalize_bib_links(parse.get("bib_entries", {})),
        ref_entries=parse.get("ref_entries", {}),
    )