# Source: Hugging Face Space ArneBinder/ScientificArgumentRecommender (status: Running)
# File size: 8,833 bytes; revision ced4316

import json
import os
import uuid
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Optional

import requests
from bs4 import BeautifulSoup

from .grobid_client import GrobidClient
from .grobid_util import extract_paper_metadata_from_grobid_xml, parse_bibliography
from .s2orc_paper import Paper
from .utils import (
    _clean_empty_and_duplicate_authors_from_grobid_parse,
    check_if_citations_are_bracket_style,
    extract_abstract_from_tei_xml,
    extract_back_matter_from_tei_xml,
    extract_body_text_from_tei_xml,
    extract_figures_and_tables_from_tei_xml,
    normalize_grobid_id,
    sub_all_note_tags,
)

# Default working directories, given relative to the current working directory.
# BASE_TEMP_DIR and BASE_OUTPUT_DIR are the defaults for the `temp_dir` and
# `output_dir` parameters of `process_pdf_file`.
BASE_TEMP_DIR = "./grobid/temp"
BASE_OUTPUT_DIR = "./grobid/output"
# NOTE(review): not referenced in the visible part of this module - confirm
# whether other modules import it before removing.
BASE_LOG_DIR = "./grobid/log"


def convert_tei_xml_soup_to_s2orc_json(soup: BeautifulSoup, paper_id: str, pdf_hash: str) -> Paper:
    """
    Build a S2ORC ``Paper`` from parsed Grobid TEI XML.

    The steps run in a fixed order because the soup is rewritten along the way
    (note tags are replaced before abstract, body and back matter are read).

    :param soup: BeautifulSoup of XML file content
    :param paper_id: identifier stored on the paper (callers pass the file name)
    :param pdf_hash: hash of PDF
    :return: the assembled ``Paper``
    """
    # header metadata, with empty / duplicated authors cleaned up
    paper_metadata = extract_paper_metadata_from_grobid_xml(soup.fileDesc)
    paper_metadata["authors"] = _clean_empty_and_duplicate_authors_from_grobid_parse(
        paper_metadata["authors"]
    )

    # bibliography entries (empty entries are dropped by the parser), keyed by
    # their normalized Grobid id; a later duplicate id overwrites an earlier one
    bib_by_key = {}
    for bib_entry in parse_bibliography(soup):
        bib_by_key[normalize_grobid_id(bib_entry["ref_id"])] = bib_entry

    # NOTE: formula extraction (extract_formulas_from_tei_xml) is currently disabled

    # figure and table captions
    ref_by_key = extract_figures_and_tables_from_tei_xml(soup)

    # citation style has to be detected before the note tags are rewritten
    bracket_style = check_if_citations_are_bracket_style(soup)

    # substitute all note tags with p tags
    soup = sub_all_note_tags(soup)

    # arguments shared by the three text extractors below
    shared_args = (bib_by_key, ref_by_key, bracket_style)

    abstract_part = extract_abstract_from_tei_xml(soup, *shared_args)
    body_part = extract_body_text_from_tei_xml(soup, *shared_args)
    # acks, author statements, competing interests, abbrevs etc
    back_part = extract_back_matter_from_tei_xml(soup, *shared_args)

    return Paper(
        paper_id=paper_id,
        pdf_hash=pdf_hash,
        metadata=paper_metadata,
        abstract=abstract_part,
        body_text=body_part,
        back_matter=back_part,
        bib_entries=bib_by_key,
        ref_entries=ref_by_key,
    )


def convert_tei_xml_file_to_s2orc_json(tei_file: str, pdf_hash: str = "") -> Paper:
    """
    Convert a TEI XML file to S2ORC JSON

    The paper id is derived from the file name: a trailing ``.tei.xml`` is
    removed, so ``2101.01234.tei.xml`` yields ``2101.01234`` (dots inside the
    id are kept). For any other file name the part before the first dot is used.

    :param tei_file: path to the TEI XML file
    :param pdf_hash: hash of the PDF the TEI file was produced from
    :return: the converted paper
    :raises FileNotFoundError: if ``tei_file`` does not exist
    """
    if not os.path.exists(tei_file):
        raise FileNotFoundError("Input TEI XML file doesn't exist")

    tei_suffix = ".tei.xml"
    file_name = tei_file.split("/")[-1]
    if file_name.endswith(tei_suffix):
        # keep dots that belong to the id itself (e.g. arXiv-style ids)
        paper_id = file_name[: -len(tei_suffix)]
    else:
        paper_id = file_name.split(".")[0]

    # read inside a context manager so the file handle is always closed
    with open(tei_file, "rb") as tei_handle:
        soup = BeautifulSoup(tei_handle.read(), "xml")

    return convert_tei_xml_soup_to_s2orc_json(soup, paper_id, pdf_hash)


def process_pdf_stream(
    input_file: str, sha: str, input_stream: bytes, grobid_config: Optional[Dict] = None
) -> Dict:
    """
    Run an in-memory PDF through Grobid and return the paper as released JSON.

    :param input_file: name of the PDF; also used as the paper id
    :param sha: hash of the PDF, stored on the paper
    :param input_stream: raw PDF bytes
    :param grobid_config: configuration passed to ``GrobidClient`` (may be None)
    :return: result of ``Paper.release_json("pdf")``
    """
    # PDF -> TEI XML via Grobid
    tei_xml = GrobidClient(grobid_config).process_pdf_stream(
        input_file, input_stream, "temp", "processFulltextDocument"
    )

    # TEI XML -> Paper -> JSON
    parsed_paper = convert_tei_xml_soup_to_s2orc_json(
        BeautifulSoup(tei_xml, "xml"), input_file, sha
    )
    return parsed_paper.release_json("pdf")


def process_pdf_file(
    input_file: str,
    temp_dir: str = BASE_TEMP_DIR,
    output_dir: str = BASE_OUTPUT_DIR,
    grobid_config: Optional[Dict] = None,
    verbose: bool = True,
) -> str:
    """
    Process a PDF file and get JSON representation

    The result is cached on disk: if ``<output_dir>/<paper_id>.json`` already
    exists, Grobid is not called and the existing path is returned.

    :param input_file: path to the PDF file
    :param temp_dir: directory that receives the intermediate ``.tei.xml`` file
    :param output_dir: directory that receives the resulting ``.json`` file
    :param grobid_config: configuration passed to ``GrobidClient`` (may be None)
    :param verbose: print a message when the output file already exists
    :return: path of the JSON output file
    :raises FileNotFoundError: if ``input_file`` does not exist
    :raises AssertionError: if Grobid did not produce the expected TEI file
    """
    os.makedirs(temp_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    # get paper id as the name of the file (without its last extension)
    paper_id = ".".join(input_file.split("/")[-1].split(".")[:-1])
    tei_file = os.path.join(temp_dir, f"{paper_id}.tei.xml")
    output_file = os.path.join(output_dir, f"{paper_id}.json")

    # check if input file exists and output file doesn't
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"{input_file} doesn't exist")
    if os.path.exists(output_file):
        if verbose:
            print(f"{output_file} already exists!")
        return output_file

    # process PDF through Grobid -> TEI.XML
    client = GrobidClient(grobid_config)
    # TODO: compute PDF hash
    # TODO: add grobid version number to output
    client.process_pdf(input_file, temp_dir, "processFulltextDocument")

    # process TEI.XML -> JSON
    # Raised explicitly instead of using an `assert` statement: callers catch
    # AssertionError to detect a failed Grobid run, and `assert` statements are
    # removed when Python runs with -O.
    if not os.path.exists(tei_file):
        raise AssertionError(f"Grobid did not produce the expected TEI file {tei_file}")
    paper = convert_tei_xml_file_to_s2orc_json(tei_file)

    # Serialize before opening the output file, so a serialization error cannot
    # leave a truncated JSON file behind that later calls would treat as a
    # finished result.
    serialized = json.dumps(paper.release_json(), indent=4, sort_keys=False)
    with open(output_file, "w") as outf:
        outf.write(serialized)

    return output_file


# Fixed namespace for the name-based UUIDs derived from paper URLs below.
UUID_NAMESPACE = uuid.UUID("bab08d37-ac12-40c4-847a-20ca337742fd")


def paper_url_to_uuid(paper_url: str) -> "uuid.UUID":
    """Return the name-based (version 5) UUID of ``paper_url`` within ``UUID_NAMESPACE``.

    The same URL always yields the same UUID.
    """
    return uuid.uuid5(namespace=UUID_NAMESPACE, name=paper_url)


@dataclass
class PDFDownloader:
    verbose: bool = True

    def download(self, url: str, opath: str | Path) -> Path:
        """Download a pdf file from URL and save locally.
        Skip if there is a file at `opath` already.

        Parameters
        ----------
        url : str
            URL of the target PDF file
        opath : str
            Path to save downloaded PDF data.
        """
        if os.path.exists(opath):
            return Path(opath)

        if not os.path.exists(os.path.dirname(opath)):
            os.makedirs(os.path.dirname(opath), exist_ok=True)

        if self.verbose:
            print(f"Downloading {url} into {opath}")
        with open(opath, "wb") as f:
            res = requests.get(url)
            f.write(res.content)

        return Path(opath)


@dataclass
class FulltextExtractor:

    def __call__(self, pdf_file_path: Path | str) -> tuple[str, dict] | None:
        """Extract plain text from a PDf file"""
        raise NotImplementedError


@dataclass
class GrobidFulltextExtractor(FulltextExtractor):
    """Extract plain text from a PDF by running it through ``process_pdf_file``."""

    # directory used for both the intermediate TEI file and the JSON output
    tmp_dir: str = "./tmp/grobid"
    # passed through to process_pdf_file
    grobid_config: Optional[Dict] = None
    # string placed between sections (the title counts as the first section)
    section_seperator: str = "\n\n"
    # string placed between the paragraphs of one section
    paragraph_seperator: str = "\n"
    # passed through to process_pdf_file
    verbose: bool = True

    def construct_plain_text(self, extraction_result: dict) -> str:
        """Join the title and all sections of a post-processed result into one string.

        Expects ``extraction_result["sections"]`` to map section names to lists
        of paragraph strings; ``"title"`` is optional.
        """
        parts: list[str] = []

        # the title, if non-blank, is treated as the first section
        title = extraction_result.get("title")
        stripped_title = title.strip() if title else ""
        if stripped_title:
            parts.append(stripped_title)

        for heading, paragraphs in extraction_result["sections"].items():
            # the section name acts as the first paragraph; blank entries are
            # dropped after stripping
            stripped = (text.strip() for text in [heading] + paragraphs)
            parts.append(self.paragraph_seperator.join(text for text in stripped if text))

        return self.section_seperator.join(parts)

    def postprocess_extraction_result(self, extraction_result: dict) -> dict:
        """Return a copy of the result with a ``"sections"`` entry added.

        ``"sections"`` groups the texts of ``pdf_parse.body_text`` by their
        section name, keeping the order of first appearance. The input dict is
        not modified.
        """
        grouped: dict[str, list[str]] = {}
        for entry in extraction_result["pdf_parse"]["body_text"]:
            grouped.setdefault(entry["section"], []).append(entry["text"])

        return {**extraction_result, "sections": grouped}

    def __call__(self, pdf_file_path: Path | str) -> tuple[str, dict] | None:
        """Extract plain text from a PDF file.

        Returns the plain text together with the extraction result as loaded
        from disk (without the added ``"sections"`` entry), or ``None`` when an
        ``AssertionError`` is raised while processing.
        """
        try:
            json_path = process_pdf_file(
                str(pdf_file_path),
                temp_dir=self.tmp_dir,
                output_dir=self.tmp_dir,
                grobid_config=self.grobid_config,
                verbose=self.verbose,
            )
            with open(json_path, "r") as handle:
                extraction_result = json.load(handle)

            with_sections = self.postprocess_extraction_result(extraction_result)
            return self.construct_plain_text(with_sections), extraction_result
        except AssertionError:
            print("Grobid failed to parse this document.")
            return None