ArneBinder's picture
update from https://github.com/ArneBinder/pie-document-level/pull/397
ced4316 verified
from dataclasses import dataclass
from typing import Iterator
from acl_anthology import Anthology
from .process_pdf import paper_url_to_uuid
from .raw_paper import RawPaper
@dataclass
class XML2RawPapers:
    """Lazily convert the papers of an ACL Anthology into ``RawPaper`` records.

    Calling an instance walks ``anthology.collections``, then every volume of
    each collection, then every paper of each volume, and yields one
    ``RawPaper`` per paper.

    Attributes:
        anthology: The anthology whose collections are traversed.
        collection_id_filters: Optional list of substrings. When set, a
            collection is processed only if its id contains at least one of
            them. An empty list therefore skips every collection.
        venue_id_whitelist: Optional list of venue ids. When set, a volume is
            processed only if at least one of them is in ``volume.venue_ids``.
        verbose: If true, print the id of each collection being processed.
    """

    anthology: Anthology
    collection_id_filters: list[str] | None = None
    venue_id_whitelist: list[str] | None = None
    verbose: bool = True

    @staticmethod
    def _paper_name(paper, collection_id: str, volume_id: str) -> str:
        """Return the name used to identify *paper* (and to derive its UUID)."""
        pdf = paper.pdf
        # Prefer the PDF reference name, unless it is missing or contains
        # "http" (i.e. looks like a URL rather than a plain identifier).
        if pdf is not None and pdf.name is not None and "http" not in pdf.name:
            return pdf.name
        # Fallback: build a name from the volume and paper ids. Paper ids are
        # left-padded with zeros to three characters only for
        # single-character collection ids.
        if len(collection_id) == 1:
            return f"{volume_id}.{paper.id.rjust(3, '0')}"
        return f"{volume_id}.{paper.id}"

    def __call__(self, *args, **kwargs) -> Iterator[RawPaper]:
        """Yield a ``RawPaper`` for every paper that passes the filters.

        Positional and keyword arguments are accepted but ignored.

        Yields:
            One ``RawPaper`` per paper; ``fulltext`` is always ``None`` and
            ``abstract`` / ``url`` are ``None`` when the paper has no abstract
            / PDF reference.
        """
        for collection_id, collection in self.anthology.collections.items():
            if self.collection_id_filters is not None and not any(
                filter_str in collection_id for filter_str in self.collection_id_filters
            ):
                continue
            if self.verbose:
                print(f"Processing collection: {collection_id}")

            for volume in collection.volumes():
                if self.venue_id_whitelist is not None and not any(
                    venue_id in volume.venue_ids for venue_id in self.venue_id_whitelist
                ):
                    continue

                volume_id = f"{collection_id}-{volume.id}"
                for paper in volume.papers():
                    name = self._paper_name(paper, collection_id, volume_id)
                    paper_uuid = paper_url_to_uuid(name)
                    yield RawPaper(
                        paper_uuid=str(paper_uuid),
                        name=name,
                        collection_id=collection_id,
                        # NOTE(review): indexing [0] presumably raises if the
                        # volume has no venues - confirm that cannot happen.
                        collection_acronym=volume.venues()[0].acronym,
                        volume_id=volume_id,
                        booktitle=volume.title.as_text(),
                        paper_id=int(paper.id),
                        year=int(paper.year),
                        paper_title=paper.title.as_text(),
                        authors=[
                            {"first": author.first, "last": author.last}
                            for author in paper.authors
                        ],
                        abstract=(
                            paper.abstract.as_text() if paper.abstract is not None else None
                        ),
                        url=paper.pdf.url if paper.pdf is not None else None,
                        bibkey=paper.bibkey,
                        doi=paper.doi,
                        fulltext=None,
                    )