File size: 2,237 Bytes
ced4316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from typing import Any, Dict

from .s2orc_paper import METADATA_KEYS, Paper


def load_s2orc(paper_dict: Dict[str, Any]) -> Paper:
    """
    Load a released S2ORC record into a Paper object.

    Two layouts are recognised:

    * records with a non-empty ``grobid_parse`` entry (2019 gorc parses):
      the parse is read from ``grobid_parse`` and the metadata from the
      top-level ``metadata`` entry;
    * records with a non-empty ``pdf_parse`` entry, or with a non-empty
      top-level ``body_text`` (current and 2020 release JSON): the parse is
      read from ``pdf_parse`` when it is non-empty, otherwise from the
      top-level dict itself.

    Metadata is restricted to the keys listed in ``METADATA_KEYS``. When the
    second layout carries no metadata, a placeholder with empty
    title/authors/year is used.

    Side effect: every bib entry that has a ``link`` key gets a ``links`` key
    holding that single link in a list. The entry dicts are updated in place,
    so the change is visible through ``paper_dict`` as well.

    :param paper_dict: one S2ORC record; must contain ``paper_id``
    :return: the Paper built from the record
    :raises KeyError: if ``paper_id`` is missing, or if ``metadata`` is
        missing from a record that has a ``grobid_parse``
    :raises NotImplementedError: if the record matches neither layout
    """
    paper_id = paper_dict["paper_id"]
    pdf_hash = paper_dict.get("_pdf_hash", paper_dict.get("s2_pdf_hash", None))

    grobid_parse = paper_dict.get("grobid_parse")
    if grobid_parse:
        # 2019 gorc parses: metadata lives next to the parse, at the top level
        parse = grobid_parse
        metadata = {k: v for k, v in paper_dict["metadata"].items() if k in METADATA_KEYS}
    elif paper_dict.get("pdf_parse") or paper_dict.get("body_text"):
        # current and 2020 s2orc release_json
        # Only descend into "pdf_parse" when it actually holds a parse. A
        # record with an empty/None "pdf_parse" but a top-level "body_text"
        # must be read from the top level, otherwise the body text is lost
        # (or, for None, the lookups below fail).
        parse = paper_dict.get("pdf_parse") or paper_dict
        raw_metadata = parse.get("metadata")
        if raw_metadata:
            metadata = {k: v for k, v in raw_metadata.items() if k in METADATA_KEYS}
        else:
            # 2020 s2orc releases (metadata is separate)
            metadata = {"title": None, "authors": [], "year": None}
    else:
        raise NotImplementedError(f"Unknown S2ORC file type! (paper_id={paper_id})")

    # From here on both layouts are handled identically.
    bib_entries = parse.get("bib_entries", {})
    for entry in bib_entries.values():
        # expose a single "link" under the plural "links" key as a one-item list
        if "link" in entry:
            entry["links"] = [entry["link"]]

    return Paper(
        paper_id=paper_id,
        pdf_hash=pdf_hash,
        metadata=metadata,
        abstract=parse.get("abstract", []),
        body_text=parse.get("body_text", []),
        back_matter=parse.get("back_matter", []),
        bib_entries=bib_entries,
        ref_entries=parse.get("ref_entries", {}),
    )