from dataclasses import dataclass
from typing import Iterator

from acl_anthology import Anthology

from .process_pdf import paper_url_to_uuid
from .raw_paper import RawPaper


@dataclass
class XML2RawPapers:
    """Callable that walks an ``Anthology`` and yields one ``RawPaper`` per paper.

    Attributes:
        anthology: Source object whose ``collections`` mapping is iterated.
        collection_id_filters: When not ``None``, a collection is processed
            only if at least one of these strings occurs as a substring of
            its collection id. An empty list therefore selects nothing.
        venue_id_whitelist: When not ``None``, a volume is processed only if
            at least one of these ids is contained in ``volume.venue_ids``.
        verbose: When true, print one line per collection that is processed.
    """

    anthology: Anthology
    collection_id_filters: list[str] | None = None
    venue_id_whitelist: list[str] | None = None
    verbose: bool = True

    def __call__(self, *args, **kwargs) -> Iterator[RawPaper]:
        """Lazily yield a ``RawPaper`` for every paper that passes the filters.

        Positional and keyword arguments are accepted but ignored.

        Yields:
            RawPaper: One record per paper; ``fulltext`` is always ``None``.

        Raises:
            IndexError: If a processed volume has an empty ``venues()`` list.
            ValueError: If ``paper.id`` or ``paper.year`` cannot be converted
                with ``int()``.
        """
        for collection_id, collection in self.anthology.collections.items():
            if not self._collection_selected(collection_id):
                continue
            if self.verbose:
                print(f"Processing collection: {collection_id}")

            for volume in collection.volumes():
                if not self._volume_selected(volume):
                    continue
                volume_id = f"{collection_id}-{volume.id}"
                for paper in volume.papers():
                    yield self._to_raw_paper(collection_id, volume_id, volume, paper)

    def _collection_selected(self, collection_id: str) -> bool:
        """Return whether ``collection_id`` passes ``collection_id_filters``."""
        if self.collection_id_filters is None:
            return True
        return any(
            filter_str in collection_id for filter_str in self.collection_id_filters
        )

    def _volume_selected(self, volume) -> bool:
        """Return whether ``volume`` passes ``venue_id_whitelist``."""
        if self.venue_id_whitelist is None:
            return True
        return any(
            venue_id in volume.venue_ids for venue_id in self.venue_id_whitelist
        )

    @staticmethod
    def _paper_name(collection_id: str, volume_id: str, paper) -> str:
        """Return the name used to identify ``paper``.

        The PDF's own ``name`` is used when it exists and does not contain
        ``"http"``; otherwise a name is synthesized from the volume id and
        the paper id.
        """
        pdf = paper.pdf
        if pdf is not None and pdf.name is not None and "http" not in pdf.name:
            return pdf.name
        # NOTE(review): the paper id is zero-padded to three characters only
        # when the collection id is a single character -- confirm that this
        # is the intended condition.
        if len(collection_id) == 1:
            return f"{volume_id}.{paper.id.rjust(3, '0')}"
        return f"{volume_id}.{paper.id}"

    def _to_raw_paper(self, collection_id: str, volume_id: str, volume, paper) -> RawPaper:
        """Build the ``RawPaper`` record for one paper of ``volume``."""
        name = self._paper_name(collection_id, volume_id, paper)
        paper_uuid = paper_url_to_uuid(name)
        return RawPaper(
            paper_uuid=str(paper_uuid),
            name=name,
            collection_id=collection_id,
            # Only the first venue of the volume supplies the acronym.
            collection_acronym=volume.venues()[0].acronym,
            volume_id=volume_id,
            booktitle=volume.title.as_text(),
            paper_id=int(paper.id),
            year=int(paper.year),
            paper_title=paper.title.as_text(),
            authors=[
                {"first": author.first, "last": author.last}
                for author in paper.authors
            ],
            abstract=paper.abstract.as_text() if paper.abstract is not None else None,
            url=paper.pdf.url if paper.pdf is not None else None,
            bibkey=paper.bibkey,
            doi=paper.doi,
            # The full text is not available here; it is always left unset.
            fulltext=None,
        )