|
from dataclasses import dataclass |
|
from typing import Iterator |
|
|
|
from acl_anthology import Anthology |
|
|
|
from .process_pdf import paper_url_to_uuid |
|
from .raw_paper import RawPaper |
|
|
|
|
|
@dataclass
class XML2RawPapers:
    """Callable that walks an anthology and yields one ``RawPaper`` per paper.

    Calling an instance iterates over ``anthology.collections``, then over
    each collection's volumes and each volume's papers, optionally skipping
    collections and volumes according to the two filter attributes.
    """

    # Anthology object whose ``collections`` mapping is iterated.
    anthology: Anthology
    # When not None, a collection is processed only if at least one of these
    # strings occurs as a substring of its collection id.
    collection_id_filters: list[str] | None = None
    # When not None, a volume is processed only if at least one of these
    # venue ids is contained in ``volume.venue_ids``.
    venue_id_whitelist: list[str] | None = None
    # When True, print a progress line for every processed collection.
    verbose: bool = True

    def __call__(self, *args, **kwargs) -> Iterator[RawPaper]:
        """Lazily yield a ``RawPaper`` for every paper that passes the filters.

        Positional and keyword arguments are accepted but ignored.

        Yields:
            One ``RawPaper`` per paper; ``fulltext`` is always ``None`` and
            ``abstract`` / ``url`` are ``None`` when the paper lacks them.
        """
        for collection_id, collection in self.anthology.collections.items():
            if not self._collection_selected(collection_id):
                continue
            if self.verbose:
                print(f"Processing collection: {collection_id}")

            for volume in collection.volumes():
                if not self._volume_selected(volume):
                    continue

                volume_id = f"{collection_id}-{volume.id}"

                for paper in volume.papers():
                    name = self._paper_name(collection_id, volume_id, paper)
                    paper_uuid = paper_url_to_uuid(name)
                    has_pdf = paper.pdf is not None
                    has_abstract = paper.abstract is not None

                    yield RawPaper(
                        paper_uuid=str(paper_uuid),
                        name=name,
                        collection_id=collection_id,
                        # NOTE(review): takes the first venue; assumes every
                        # volume has at least one venue - confirm.
                        collection_acronym=volume.venues()[0].acronym,
                        volume_id=volume_id,
                        booktitle=volume.title.as_text(),
                        paper_id=int(paper.id),
                        year=int(paper.year),
                        paper_title=paper.title.as_text(),
                        authors=[
                            {"first": author.first, "last": author.last}
                            for author in paper.authors
                        ],
                        abstract=paper.abstract.as_text() if has_abstract else None,
                        url=paper.pdf.url if has_pdf else None,
                        bibkey=paper.bibkey,
                        doi=paper.doi,
                        # Full text is not extracted at this stage.
                        fulltext=None,
                    )

    def _collection_selected(self, collection_id: str) -> bool:
        """Return True if the collection id passes ``collection_id_filters``."""
        if self.collection_id_filters is None:
            return True
        return any(
            filter_str in collection_id for filter_str in self.collection_id_filters
        )

    def _volume_selected(self, volume) -> bool:
        """Return True if the volume passes ``venue_id_whitelist``."""
        if self.venue_id_whitelist is None:
            return True
        return any(
            venue_id in volume.venue_ids for venue_id in self.venue_id_whitelist
        )

    @staticmethod
    def _paper_name(collection_id: str, volume_id: str, paper) -> str:
        """Return the paper's PDF name, or a name built from its ids."""
        pdf = paper.pdf
        # Use the PDF name as-is only when it does not contain "http".
        if pdf is not None and pdf.name is not None and "http" not in pdf.name:
            return pdf.name
        # NOTE(review): the paper id is zero-padded to three characters only
        # for one-character collection ids - confirm this is the intended
        # trigger.
        if len(collection_id) == 1:
            return f"{volume_id}.{paper.id.rjust(3, '0')}"
        return f"{volume_id}.{paper.id}"
|
|