File size: 2,237 Bytes
ced4316 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
from typing import Any, Dict
from .s2orc_paper import METADATA_KEYS, Paper
def _add_links_from_link(bib_entries: Dict[str, Any]) -> None:
    """Give every bib entry that has a singular ``link`` key a ``links`` list.

    Entries are updated in place; entries without ``link`` are left untouched.
    """
    for entry in bib_entries.values():
        if "link" in entry:
            entry["links"] = [entry["link"]]


def load_s2orc(paper_dict: Dict[str, Any]) -> Paper:
    """
    Load release S2ORC into Paper class

    Two record layouts are recognised:

    * 2019 gorc parses: the parsed sections live under ``grobid_parse`` and
      the metadata under the top-level ``metadata`` key.
    * current and 2020 s2orc release json: the parsed sections live under
      ``pdf_parse`` when that key holds a non-empty value, otherwise directly
      on the record itself (qualified by a non-empty ``body_text``).

    :param paper_dict: one S2ORC record; must contain ``paper_id``. Its bib
        entries are modified in place (``links`` is added next to ``link``).
    :return: Paper built from the record
    :raises KeyError: if ``paper_id`` is missing, or ``metadata`` is missing
        from a ``grobid_parse`` record
    :raises NotImplementedError: if the record matches neither layout
    """
    paper_id = paper_dict["paper_id"]
    pdf_hash = paper_dict.get("_pdf_hash", paper_dict.get("s2_pdf_hash", None))

    grobid_parse = paper_dict.get("grobid_parse")
    pdf_parse = paper_dict.get("pdf_parse")

    if grobid_parse:
        # 2019 gorc parses
        parse = grobid_parse
        metadata = {k: v for k, v in paper_dict["metadata"].items() if k in METADATA_KEYS}
    elif pdf_parse or paper_dict.get("body_text"):
        # current and 2020 s2orc release_json
        # Descend into pdf_parse only when it actually holds a parse: a record
        # may carry an empty/None pdf_parse and still qualify through its
        # top-level body_text, in which case the record itself is the parse.
        parse = pdf_parse if pdf_parse else paper_dict
        raw_metadata = parse.get("metadata")
        if raw_metadata:
            metadata = {k: v for k, v in raw_metadata.items() if k in METADATA_KEYS}
        else:
            # 2020 s2orc releases (metadata is separate)
            metadata = {"title": None, "authors": [], "year": None}
    else:
        print(paper_id)
        raise NotImplementedError("Unknown S2ORC file type!")

    bib_entries = parse.get("bib_entries", {})
    _add_links_from_link(bib_entries)

    return Paper(
        paper_id=paper_id,
        pdf_hash=pdf_hash,
        metadata=metadata,
        abstract=parse.get("abstract", []),
        body_text=parse.get("body_text", []),
        back_matter=parse.get("back_matter", []),
        bib_entries=bib_entries,
        ref_entries=parse.get("ref_entries", {}),
    )
|