File size: 3,121 Bytes
ced4316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from dataclasses import dataclass
from typing import Iterator

from acl_anthology import Anthology

from .process_pdf import paper_url_to_uuid
from .raw_paper import RawPaper


@dataclass
class XML2RawPapers:
    """Callable that walks an anthology and yields one ``RawPaper`` per paper.

    Iteration goes collection -> volume -> paper. Collections and volumes can
    be narrowed with the two optional filters; a filter left as ``None``
    excludes nothing at its level, while an empty list excludes everything.
    """

    # Object whose ``collections`` mapping (id -> collection) is iterated.
    anthology: Anthology
    # Substrings; a collection is kept when its id contains at least one.
    collection_id_filters: list[str] | None = None
    # Venue ids; a volume is kept when ``volume.venue_ids`` has at least one.
    venue_id_whitelist: list[str] | None = None
    # When true, print one progress line per collection that is processed.
    verbose: bool = True

    def _keeps_collection(self, collection_id: str) -> bool:
        """Return whether ``collection_id`` passes ``collection_id_filters``."""
        if self.collection_id_filters is None:
            return True
        return any(
            filter_str in collection_id for filter_str in self.collection_id_filters
        )

    def _keeps_volume(self, volume) -> bool:
        """Return whether ``volume`` lists a venue from ``venue_id_whitelist``."""
        if self.venue_id_whitelist is None:
            return True
        return any(
            venue_id in volume.venue_ids for venue_id in self.venue_id_whitelist
        )

    @staticmethod
    def _fallback_name(collection_id: str, volume_id: str, paper_id: str) -> str:
        """Build a paper name from ids when the PDF reference has no usable one."""
        # NOTE(review): the zero-padded form is only produced for
        # single-character collection ids -- confirm such ids actually occur.
        if len(collection_id) == 1:
            return f"{volume_id}.{paper_id.rjust(3, '0')}"
        return f"{volume_id}.{paper_id}"

    def __call__(self, *args, **kwargs) -> Iterator[RawPaper]:
        """Lazily yield a ``RawPaper`` for every paper that passes the filters.

        Positional and keyword arguments are accepted but ignored.

        Yields:
            One ``RawPaper`` per paper, in the iteration order of the
            collections, their volumes and their papers. ``fulltext`` is
            always ``None``; ``abstract`` and ``url`` are ``None`` when the
            paper has no abstract or no PDF reference respectively.

        Raises:
            Nothing is caught here: errors from ``int(paper.id)``,
            ``int(paper.year)`` or from indexing ``volume.venues()[0]``
            propagate to the consumer of the iterator.
        """
        for collection_id, collection in self.anthology.collections.items():
            if not self._keeps_collection(collection_id):
                continue
            if self.verbose:
                print(f"Processing collection: {collection_id}")

            for volume in collection.volumes():
                if not self._keeps_volume(volume):
                    continue

                volume_id = f"{collection_id}-{volume.id}"

                for paper in volume.papers():
                    pdf = paper.pdf
                    # Use the PDF reference's own name unless it is missing or
                    # contains "http"; otherwise derive a name from the ids.
                    if (
                        pdf is not None
                        and pdf.name is not None
                        and "http" not in pdf.name
                    ):
                        name = pdf.name
                    else:
                        name = self._fallback_name(
                            collection_id, volume_id, paper.id
                        )

                    abstract = paper.abstract
                    yield RawPaper(
                        paper_uuid=str(paper_url_to_uuid(name)),
                        name=name,
                        collection_id=collection_id,
                        # The first venue of the volume supplies the acronym.
                        collection_acronym=volume.venues()[0].acronym,
                        volume_id=volume_id,
                        booktitle=volume.title.as_text(),
                        paper_id=int(paper.id),
                        year=int(paper.year),
                        paper_title=paper.title.as_text(),
                        authors=[
                            {"first": author.first, "last": author.last}
                            for author in paper.authors
                        ],
                        abstract=abstract.as_text() if abstract is not None else None,
                        url=pdf.url if pdf is not None else None,
                        bibkey=paper.bibkey,
                        doi=paper.doi,
                        # Full text is not extracted at this stage.
                        fulltext=None,
                    )