Spaces:

ArneBinder
/

ScientificArgumentRecommender

Running

App Files Files Community

ScientificArgumentRecommender/src/utils/pdf_utils/acl_anthology_utils.py

ArneBinder

update from https://github.com/ArneBinder/pie-document-level/pull/397

ced4316 verified 2 months ago

raw

history blame contribute delete

3.12 kB

	from dataclasses import dataclass
	from typing import Iterator

	from acl_anthology import Anthology

	from .process_pdf import paper_url_to_uuid
	from .raw_paper import RawPaper


	@dataclass
	class XML2RawPapers:
	    """Callable that walks an anthology and yields one ``RawPaper`` per paper.

	    Iteration order is collections -> volumes -> papers, exactly as returned
	    by ``anthology.collections.items()``, ``collection.volumes()`` and
	    ``volume.papers()``. Collections and volumes can be skipped via the two
	    optional filters below.
	    """

	    # Anthology object whose ``collections`` mapping is traversed.
	    anthology: Anthology
	    # If set, a collection is processed only when its id contains at least
	    # one of these strings as a substring.
	    collection_id_filters: list[str] | None = None
	    # If set, a volume is processed only when at least one of these ids is
	    # contained in ``volume.venue_ids``.
	    venue_id_whitelist: list[str] | None = None
	    # Print one progress line per processed (i.e. not filtered-out) collection.
	    verbose: bool = True

	    def __call__(self, *args, **kwargs) -> Iterator[RawPaper]:
	        """Lazily yield a ``RawPaper`` for every paper that passes the filters.

	        Args:
	            *args: Accepted for call-site compatibility; ignored.
	            **kwargs: Accepted for call-site compatibility; ignored.

	        Yields:
	            One ``RawPaper`` per paper. ``fulltext`` is always ``None`` here,
	            and ``abstract`` is ``None`` when the paper has no abstract.
	        """
	        for collection_id, collection in self.anthology.collections.items():
	            # Skip collections whose id matches none of the substring filters.
	            if self.collection_id_filters is not None and not any(
	                filter_str in collection_id for filter_str in self.collection_id_filters
	            ):
	                continue
	            if self.verbose:
	                print(f"Processing collection: {collection_id}")

	            for volume in collection.volumes():
	                # Skip volumes that belong to none of the whitelisted venues.
	                if self.venue_id_whitelist is not None and not any(
	                    venue_id in volume.venue_ids for venue_id in self.venue_id_whitelist
	                ):
	                    continue

	                volume_id = f"{collection_id}-{volume.id}"

	                for paper in volume.papers():
	                    pdf = paper.pdf
	                    if pdf is not None and pdf.name is not None and "http" not in pdf.name:
	                        # The PDF reference is a plain name (not a URL): use it directly.
	                        name = pdf.name
	                    elif len(collection_id) == 1:
	                        # Single-character collection ids get a paper id zero-padded
	                        # to three characters.
	                        # NOTE(review): presumably an old-style id scheme - confirm.
	                        name = f"{volume_id}.{paper.id.rjust(3, '0')}"
	                    else:
	                        name = f"{volume_id}.{paper.id}"

	                    paper_uuid = paper_url_to_uuid(name)
	                    yield RawPaper(
	                        paper_uuid=str(paper_uuid),
	                        name=name,
	                        collection_id=collection_id,
	                        # Only the first venue of the volume provides the acronym.
	                        collection_acronym=volume.venues()[0].acronym,
	                        volume_id=volume_id,
	                        booktitle=volume.title.as_text(),
	                        paper_id=int(paper.id),
	                        year=int(paper.year),
	                        paper_title=paper.title.as_text(),
	                        authors=[
	                            {"first": author.first, "last": author.last}
	                            for author in paper.authors
	                        ],
	                        abstract=(
	                            paper.abstract.as_text() if paper.abstract is not None else None
	                        ),
	                        url=pdf.url if pdf is not None else None,
	                        bibkey=paper.bibkey,
	                        doi=paper.doi,
	                        fulltext=None,
	                    )