|
from typing import Any, Dict |
|
|
|
from .s2orc_paper import METADATA_KEYS, Paper |
|
|
|
|
|
def _add_links_from_link(bib_entries: Dict[str, Any]) -> None:
    """Give every bib entry that has a ``link`` key a one-element ``links`` list.

    The entries are modified in place; entries without ``link`` are untouched.
    """
    for entry in bib_entries.values():
        if "link" in entry:
            entry["links"] = [entry["link"]]


def load_s2orc(paper_dict: Dict[str, Any]) -> Paper:
    """
    Load release S2ORC into Paper class

    Three input layouts are recognised, checked in this order:

    1. a truthy ``grobid_parse`` entry: the parse fields are read from it and
       the metadata from the top-level ``metadata`` entry;
    2. a truthy ``pdf_parse`` entry: parse fields and metadata are read from it;
    3. a truthy top-level ``body_text``: parse fields and metadata are read
       from ``paper_dict`` itself.

    In every layout the metadata is restricted to the keys in ``METADATA_KEYS``,
    and bib entries carrying a ``link`` key get a ``links`` list added in place
    (so the dictionaries passed in by the caller are modified).

    :param paper_dict: parsed JSON record of one paper; must contain ``paper_id``
    :return: a ``Paper`` built from the fields found in the record
    :raises KeyError: if ``paper_id`` is missing, or if ``metadata`` is missing
        while ``grobid_parse`` is present
    :raises NotImplementedError: if none of the three layouts matches
    """
    paper_id = paper_dict["paper_id"]
    pdf_hash = paper_dict.get("_pdf_hash", paper_dict.get("s2_pdf_hash", None))

    grobid_parse = paper_dict.get("grobid_parse")
    pdf_parse = paper_dict.get("pdf_parse")

    if grobid_parse:
        parse = grobid_parse
        metadata = {
            k: v for k, v in paper_dict["metadata"].items() if k in METADATA_KEYS
        }
    elif pdf_parse or paper_dict.get("body_text"):
        # Only descend into ``pdf_parse`` when it actually holds something.
        # A present-but-empty/None ``pdf_parse`` next to a populated top-level
        # ``body_text`` must fall back to the top-level dict; otherwise we
        # would call ``.get`` on None or silently drop the body text.
        parse = pdf_parse if pdf_parse else paper_dict
        raw_metadata = parse.get("metadata")
        if raw_metadata:
            metadata = {k: v for k, v in raw_metadata.items() if k in METADATA_KEYS}
        else:
            metadata = {"title": None, "authors": [], "year": None}
    else:
        print(paper_id)
        raise NotImplementedError("Unknown S2ORC file type!")

    abstract = parse.get("abstract", [])
    body_text = parse.get("body_text", [])
    back_matter = parse.get("back_matter", [])
    bib_entries = parse.get("bib_entries", {})
    _add_links_from_link(bib_entries)
    ref_entries = parse.get("ref_entries", {})

    return Paper(
        paper_id=paper_id,
        pdf_hash=pdf_hash,
        metadata=metadata,
        abstract=abstract,
        body_text=body_text,
        back_matter=back_matter,
        bib_entries=bib_entries,
        ref_entries=ref_entries,
    )
|
|