import pyrootutils

# Locate the project root (marked by a ".project-root" file), add it to the
# Python path, and load environment variables from .env so that the "src." imports
# below resolve.
root = pyrootutils.setup_root(
    search_from=__file__,
    indicator=[".project-root"],
    pythonpath=True,
    dotenv=True,
)

import os
from argparse import ArgumentParser, RawTextHelpFormatter
from dataclasses import dataclass, field
from pathlib import Path

from acl_anthology import Anthology
from tqdm import tqdm

from src.utils.pdf_utils.acl_anthology_utils import XML2RawPapers
from src.utils.pdf_utils.process_pdf import (
    FulltextExtractor,
    GrobidFulltextExtractor,
    PDFDownloader,
)

HELP_MSG = """
Generate paper json files from an ACL Anthology collection, with fulltext extraction.

Iterate over entries in the ACL Anthology metadata, and for each entry:
1. extract relevant paper info from the xml entry
2. download pdf file
3. extract fulltext
4. format a json file and save

pre-requisites:
- Install the requirements: pip install acl-anthology-py>=0.4.3 bs4 jsonschema
- Get the meta data from ACL Anthology: git clone [email protected]:acl-org/acl-anthology.git
- Start Grobid Docker container: docker run --rm --init --ulimit core=0 -p 8070:8070 lfoppiano/grobid:0.8.0
"""


@dataclass
class XML2Jsons:
    """Convert ACL Anthology XML entries into per-paper JSON files with extracted fulltext."""

    base_output_dir: Path
    pdf_output_dir: Path

    xml2raw_papers: XML2RawPapers
    pdf_downloader: PDFDownloader = field(default_factory=PDFDownloader)
    fulltext_extractor: FulltextExtractor = field(default_factory=GrobidFulltextExtractor)
    show_progress: bool = True

    @classmethod
    def from_cli(cls) -> "XML2Jsons":
        """Build an XML2Jsons instance from command-line arguments."""
        parser = ArgumentParser(description=HELP_MSG, formatter_class=RawTextHelpFormatter)
        parser.add_argument(
            "--base-output-dir",
            type=str,
            required=True,
            help="Directory to save all the paper JSON files",
        )
        parser.add_argument(
            "--pdf-output-dir",
            type=str,
            required=True,
            help="Directory to save all the downloaded PDF files",
        )
        parser.add_argument(
            "--anthology-data-dir",
            type=str,
            required=True,
            help="Path to the ACL Anthology metadata directory, e.g., /path/to/acl-anthology-repo/data. "
            "You can obtain the data via: git clone git@github.com:acl-org/acl-anthology.git",
        )
        parser.add_argument(
            "--collection-id-filters",
            nargs="+",
            type=str,
            default=None,
            help="If provided, only papers from the collections whose id (Anthology ID) contains the "
            "specified strings will be processed.",
        )
        parser.add_argument(
            "--venue-id-whitelist",
            nargs="+",
            type=str,
            default=None,
            help="If provided, only papers from the specified venues will be processed. See here for "
            "the list of venues: https://aclanthology.org/venues",
        )
        args = parser.parse_args()

        return cls(
            base_output_dir=Path(args.base_output_dir),
            pdf_output_dir=Path(args.pdf_output_dir),
            xml2raw_papers=XML2RawPapers(
                anthology=Anthology(datadir=args.anthology_data_dir),
                collection_id_filters=args.collection_id_filters,
                venue_id_whitelist=args.venue_id_whitelist,
            ),
        )

    def run(self):
        """Download PDFs, extract fulltext, and save one JSON file per paper."""
        os.makedirs(self.pdf_output_dir, exist_ok=True)
        papers = self.xml2raw_papers()
        if self.show_progress:
            papers = tqdm(list(papers), desc="extracting fulltext")
        for paper in papers:
            volume_dir = self.base_output_dir / paper.volume_id
            if paper.url is not None:
                # Download the PDF and run the fulltext extractor (GROBID) on it.
                pdf_save_path = self.pdf_downloader.download(
                    paper.url, opath=self.pdf_output_dir / f"{paper.name}.pdf"
                )
                fulltext_extraction_output = self.fulltext_extractor(pdf_save_path)

                if fulltext_extraction_output:
                    plain_text, extraction_data = fulltext_extraction_output
                    paper.fulltext = extraction_data.get("sections")
                    # Prefer the abstract from the Anthology metadata; fall back to
                    # the one extracted from the PDF.
                    if not paper.abstract:
                        paper.abstract = extraction_data.get("abstract")
                paper.save(str(volume_dir))


if __name__ == "__main__":
    xml2jsons = XML2Jsons.from_cli()
    xml2jsons.run()