Spaces:

ArneBinder
/

ScientificArgumentRecommender

Running

File size: 32,262 Bytes

ced4316

import re
from typing import Dict, List, Tuple

import bs4


def replace_refspans(
    spans_to_replace: List[Tuple[int, int, str, str]],
    full_string: str,
    pre_padding: str = "",
    post_padding: str = "",
    btwn_padding: str = ", ",
) -> str:
    """
    For each span within the full string, replace that span with new text.

    Spans are handled in order of their start index. A span that starts before the end
    of the previously kept span overlaps it and is dropped. Every kept replacement is
    wrapped in ``pre_padding`` / ``post_padding``; when a kept span starts exactly where
    the previous kept span ended, ``btwn_padding`` is inserted between the two.

    The input list is not modified.

    :param spans_to_replace: list of tuples of form (start_ind, end_ind, span_text, new_substring)
    :param full_string: text the span indices refer to
    :param pre_padding: text inserted directly before every replacement
    :param post_padding: text inserted directly after every replacement
    :param btwn_padding: text inserted between two replacements whose spans abut
    :return: ``full_string`` with all kept spans replaced
    :raises AssertionError: if a span's text does not match ``full_string`` at its indices,
        or if two spans share the same start index
    """
    # assert all spans are equal to full_text span
    assert all(full_string[start:end] == span for start, end, span, _ in spans_to_replace)

    # assert none of the spans start with the same start ind
    start_inds = [rep[0] for rep in spans_to_replace]
    assert len(set(start_inds)) == len(start_inds)

    # work on a sorted copy so the caller's list keeps its order and contents
    ordered_spans = sorted(spans_to_replace, key=lambda x: x[0])

    pieces = []
    cursor = 0  # index in full_string up to which text has already been emitted
    prev_end = None  # end index of the last span that was kept
    for start, end, _span, new_string in ordered_spans:
        # skip empties
        if end <= 0:
            continue
        # overlap with the previously kept span: drop this replacement
        if prev_end is not None and start < prev_end:
            continue
        # untouched text between the previous replacement and this one
        pieces.append(full_string[cursor:start])
        # span abuts the previous one: separate the two replacements
        if prev_end is not None and start == prev_end:
            pieces.append(btwn_padding)
        pieces.append(pre_padding + new_string + post_padding)
        cursor = end
        prev_end = end

    # remaining text after the last replacement
    pieces.append(full_string[cursor:])
    return "".join(pieces)


# Bracketed numeric citation group such as "[1]", "[2, 3]" or "[4-6;]": numbers have one to
# three digits and no leading zero, separated by runs of ",", ";", "-" or whitespace.
BRACKET_REGEX = re.compile(r"\[[1-9]\d{0,2}([,;\-\s]+[1-9]\d{0,2})*;?\]")
# A document counts as bracket style when MORE than this many citation strings match BRACKET_REGEX.
BRACKET_STYLE_THRESHOLD = 5

# Single bracketed citation number such as "[12]"; group 1 captures the number.
SINGLE_BRACKET_REGEX = re.compile(r"\[([1-9]\d{0,2})\]")
# Characters that mark a citation range between two references (hyphen-minus and en dash).
EXPANSION_CHARS = {"-", "–"}

# Literal substring substitutions applied to a serialized table to turn its row/cell markup
# into HTML tr/td markup.
REPLACE_TABLE_TOKS = {
    "<row>": "<tr>",
    "<row/>": "<tr/>",
    "</row>": "</tr>",
    "<cell>": "<td>",
    "<cell/>": "<td/>",
    "</cell>": "</td>",
    "<cell ": "<td ",
    "cols=": "colspan=",
}


def span_already_added(sub_start: int, sub_end: int, span_indices: List[Tuple[int, int]]) -> bool:
    """
    Tell whether the candidate span lies entirely inside one of the known spans
    :param sub_start: start index of the candidate span
    :param sub_end: end index of the candidate span
    :param span_indices: (start, end) pairs of the spans collected so far
    :return: True if some known span fully contains the candidate, else False
    """
    return any(
        outer_start <= sub_start and sub_end <= outer_end
        for outer_start, outer_end in span_indices
    )


def is_expansion_string(between_string: str) -> bool:
    """
    Decide whether the text found between two refs marks a citation range
    :param between_string: text located between two reference tags
    :return: True when the text is at most two characters long, consists only of
        expansion characters and spaces, and contains at least one expansion character
    """
    if len(between_string) > 2:
        return False

    allowed_chars = EXPANSION_CHARS.union({" "})
    saw_marker = False
    for char in between_string:
        # anything other than a range marker or a space disqualifies the string
        if char not in allowed_chars:
            return False
        if char in EXPANSION_CHARS:
            saw_marker = True
    return saw_marker


# TODO: still cases like `09bcee03baceb509d4fcf736fa1322cb8adf507f` w/ dups like ['L Jung', 'R Hessler', 'Louis Jung', 'Roland Hessler']
# example paper that has empties & duplicates: `09bce26cc7e825e15a4469e3e78b7a54898bb97f`
def _clean_empty_and_duplicate_authors_from_grobid_parse(
    authors: List[Dict],
) -> List[Dict]:
    """
    Remove nameless authors and merge repeated authors.

    Name fields (`first`, `last`, `middle`, `suffix`) are whitespace-stripped in place on
    the given author dicts. An author is kept only if a first name, a last name or at
    least one non-empty middle name remains. Authors with identical stripped name fields
    are merged: the first occurrence keeps its position, and a later occurrence
    overwrites its `email` (if non-empty) and its `affiliation` (if it has an
    institution, laboratory or location).

    :param authors: author dicts with keys `first`, `last`, `middle`, `suffix`, `email`
        and `affiliation`
    :return: cleaned, de-duplicated authors in order of first appearance
    """
    # stripping empties
    clean_authors_list = []
    for author in authors:
        clean_first = author["first"].strip()
        clean_last = author["last"].strip()
        clean_middle = [m.strip() for m in author["middle"]]
        clean_suffix = author["suffix"].strip()
        # a list like [""] is truthy, so look at the middle names themselves
        if clean_first or clean_last or any(clean_middle):
            author["first"] = clean_first
            author["last"] = clean_last
            author["middle"] = clean_middle
            author["suffix"] = clean_suffix
            clean_authors_list.append(author)

    # combining duplicates (preserve first occurrence of author name as position)
    key_to_author_blobs = {}
    ordered_keys_by_author_pos = []
    for author in clean_authors_list:
        key = (
            author["first"],
            author["last"],
            " ".join(author["middle"]),
            author["suffix"],
        )
        if key not in key_to_author_blobs:
            key_to_author_blobs[key] = author
            ordered_keys_by_author_pos.append(key)
            continue

        # duplicate: let the later entry fill in / override contact details
        merged = key_to_author_blobs[key]
        if author["email"]:
            merged["email"] = author["email"]
        affiliation = author["affiliation"]
        if affiliation and (
            affiliation["institution"] or affiliation["laboratory"] or affiliation["location"]
        ):
            merged["affiliation"] = affiliation

    return [key_to_author_blobs[key] for key in ordered_keys_by_author_pos]


def sub_spans_and_update_indices(
    spans_to_replace: List[Tuple[int, int, str, str]], full_string: str
) -> Tuple[str, List]:
    """
    Replace all spans and recompute indices.

    Each input tuple is (start, end, token, surface): ``full_string[start:end]`` must
    equal ``token`` and is replaced by ``surface``. The returned spans carry the start
    and end of each surface form in the new text. The input list is not modified.

    :param spans_to_replace: list of (start_ind, end_ind, token, surface) tuples
    :param full_string: text the span indices refer to
    :return: (new text, list of (new_start, new_end, token, surface) sorted by start)
    :raises AssertionError: if a token does not match ``full_string`` at its indices,
        or if two spans share the same start index
    """
    # TODO: check no spans overlapping
    # TODO: check all spans well-formed

    # assert all spans are equal to full_text span
    assert all(full_string[start:end] == token for start, end, token, _ in spans_to_replace)

    # assert none of the spans start with the same start ind
    start_inds = [rep[0] for rep in spans_to_replace]
    assert len(set(start_inds)) == len(start_inds)

    # sort a copy by start index so the caller's list is left untouched
    ordered_spans = sorted(spans_to_replace, key=lambda x: x[0])

    # every replacement moves all later spans by the length difference between surface
    # form and token, so a single running offset is enough
    result = []
    running_offset = 0
    for start, end, token, surface in ordered_spans:
        new_start = start + running_offset
        result.append((new_start, new_start + len(surface), token, surface))
        running_offset += len(surface) - (end - start)

    # generate new text
    new_text = replace_refspans(ordered_spans, full_string, btwn_padding="")

    return new_text, result


class UniqTokenGenerator:
    """
    Endless iterator producing tokens made of a fixed prefix and a running counter
    (prefix0, prefix1, ...)
    """

    def __init__(self, tok_string):
        # counter of the next token to hand out
        self.ind = 0
        # prefix shared by all generated tokens
        self.tok_string = tok_string

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        current = self.ind
        self.ind = current + 1
        return "{}{}".format(self.tok_string, current)


def normalize_grobid_id(grobid_id: str) -> str:
    """
    Normalize grobid object identifiers.

    The id is upper-cased and stripped of "_" and "#"; a leading type prefix is then
    rewritten: B -> BIBREF, TAB -> TABREF, FIG -> FIGREF, FORMULA -> EQREF
    (e.g. "#b2" -> "BIBREF2", "#tab_0" -> "TABREF0"). Only the leading prefix is
    rewritten; the rest of the id is kept as is. Ids with any other prefix are
    returned in their upper-cased, stripped form.

    :param grobid_id: raw identifier, e.g. the `target` or `xml:id` attribute value
    :return: normalized identifier
    """
    str_norm = grobid_id.upper().replace("_", "").replace("#", "")
    # order matters: "B" is tested first, as in the prefix checks this replaces
    prefix_map = (
        ("B", "BIBREF"),
        ("TAB", "TABREF"),
        ("FIG", "FIGREF"),
        ("FORMULA", "EQREF"),
    )
    for prefix, normalized_prefix in prefix_map:
        if str_norm.startswith(prefix):
            # swap the prefix only; str.replace would also rewrite later occurrences
            return normalized_prefix + str_norm[len(prefix):]
    return str_norm


def extract_formulas_from_tei_xml(sp: bs4.BeautifulSoup) -> None:
    """
    Swap every formula element in the soup for a plain string holding its stripped text
    :param sp: parsed document; modified in place
    :return:
    """
    for formula_tag in sp.find_all("formula"):
        formula_text = formula_tag.text.strip()
        formula_tag.replace_with(sp.new_string(formula_text))


def table_to_html(table: bs4.element.Tag) -> str:
    """
    Sub table tags with html table tags.

    Every direct child of the table that is not a `row` element is reported on stdout
    and removed from the table (the table is modified in place). The remaining markup
    is serialized and its row/cell tokens are replaced according to REPLACE_TABLE_TOKS.

    :param table: table element to convert; None yields an empty string
    :return: HTML-like string for the table
    """
    # a figure of type "table" may come without a table element
    if table is None:
        return ""

    # iterate over a snapshot: removing children while walking the live child list
    # would skip the sibling that follows each removed child
    for child in list(table.contents):
        child_name = getattr(child, "name", None)
        if child_name == "row":
            continue
        print(f"Unknown table subtag: {child_name}")
        if isinstance(child, bs4.element.Tag):
            child.decompose()
        else:
            # text nodes are detached with extract(), which every page element provides
            child.extract()

    table_str = str(table)
    for token, subtoken in REPLACE_TABLE_TOKS.items():
        table_str = table_str.replace(token, subtoken)
    return table_str


def extract_figures_and_tables_from_tei_xml(sp: bs4.BeautifulSoup) -> Dict[str, Dict]:
    """
    Collect one dict per identified figure/table element, keyed by normalized id
    :param sp: parsed document; processed figure elements are removed from it
    :return: mapping from normalized figure/table id to its description dict
    """
    ref_map = {}

    for figure in sp.find_all("figure"):
        try:
            # only figures that carry an xml:id are recorded
            figure_id = figure.name and figure.get("xml:id")
            if figure_id:
                if figure.get("type") == "table":
                    # caption: description first, then the heading, else nothing
                    if figure.figDesc:
                        caption = figure.figDesc.text.strip()
                    elif figure.head:
                        caption = figure.head.text.strip()
                    else:
                        caption = ""
                    entry = {
                        "text": caption,
                        "latex": None,
                        "type": "table",
                        "content": table_to_html(figure.table),
                        "fig_num": figure_id,
                    }
                else:
                    label_tag = figure.findNext("head").findNext("label")
                    # use the label as figure number only if one of its children is numeric
                    label_has_digits = any([child.isdigit() for child in label_tag])
                    fig_num = label_tag.contents[0] if label_has_digits else None
                    entry = {
                        "text": figure.figDesc.text.strip() if figure.figDesc else "",
                        "latex": None,
                        "type": "figure",
                        "content": "",
                        "fig_num": fig_num,
                    }
                ref_map[normalize_grobid_id(figure_id)] = entry
        except AttributeError:
            # an expected sub-element is missing: skip this figure and leave it in the soup
            continue
        figure.decompose()

    return ref_map


def check_if_citations_are_bracket_style(sp: bs4.BeautifulSoup) -> bool:
    """
    Decide whether the document cites with bracketed numbers such as [3]
    :param sp: parsed document
    :return: True if more than BRACKET_STYLE_THRESHOLD citation strings match BRACKET_REGEX
    """
    body = sp.body
    if not body:
        return False

    # gather the surface text of bibliographic refs from divs that have no head
    cite_strings = []
    for div in body.find_all("div"):
        if not div.head:
            for rtag in div.find_all("ref"):
                if rtag.get("type") == "bibr":
                    cite_strings.append(rtag.text.strip())

    # count how many match bracket style
    bracket_count = sum(1 for cite_str in cite_strings if BRACKET_REGEX.match(cite_str))
    return bracket_count > BRACKET_STYLE_THRESHOLD


def sub_all_note_tags(sp: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
    """
    Turn every note element into a p element holding the note's stripped text
    :param sp: parsed document; modified in place
    :return: the same soup object
    """
    for note_tag in sp.find_all("note"):
        paragraph = sp.new_tag("p")
        paragraph.string = note_tag.text.strip()
        note_tag.replace_with(paragraph)
    return sp


def process_formulas_in_paragraph(para_el: bs4.BeautifulSoup, sp: bs4.BeautifulSoup) -> None:
    """
    Replace each formula in the paragraph with a string of its text followed by its label
    :param para_el: paragraph element; modified in place
    :param sp: soup used to create the replacement strings
    :return:
    """
    for formula in para_el.find_all("formula"):
        label_tag = formula.label
        label_suffix = ""
        if label_tag:
            # keep the label text, separated by a space, and drop the label element so
            # it is not repeated inside the formula text
            label_suffix = " " + label_tag.text
            label_tag.decompose()
        formula_text = formula.text.strip()
        formula.replace_with(sp.new_string(f"{formula_text}{label_suffix}"))


def process_references_in_paragraph(
    para_el: bs4.BeautifulSoup, sp: bs4.BeautifulSoup, refs: Dict
) -> Dict:
    """
    Replace table/figure refs in the paragraph by placeholder tokens and describe each token
    :param para_el: paragraph element; modified in place
    :param sp: soup used to create the replacement strings
    :param refs: known figure/table entries keyed by normalized id
    :return: dict mapping each REFTOKEN placeholder to (ref_id or None, surface_form, type)
    """
    tokgen = UniqTokenGenerator("REFTOKEN")
    ref_dict = {}
    for rtag in para_el.find_all("ref"):
        try:
            ref_type = rtag.get("type")

            # citations are handled elsewhere
            if ref_type == "bibr":
                continue

            # anything that is neither table nor figure collapses to its surface form
            if ref_type not in ("table", "figure"):
                rtag.replace_with(sp.new_string(rtag.text.strip()))
                continue

            # link to a known figure/table only if the normalized target exists
            linked_id = None
            target = rtag.get("target")
            if target:
                candidate = normalize_grobid_id(target)
                if candidate in refs:
                    linked_id = candidate

            # register the placeholder and put it in place of the tag
            ref_key = tokgen.next()
            ref_dict[ref_key] = (linked_id, rtag.text.strip(), ref_type)
            rtag.replace_with(sp.new_string(f" {ref_key} "))
        except AttributeError:
            continue
    return ref_dict


def process_citations_in_paragraph(
    para_el: bs4.BeautifulSoup, sp: bs4.BeautifulSoup, bibs: Dict, bracket: bool
) -> Dict:
    """
    Process all citations in paragraph and generate a dict for surface forms.

    Every remaining `ref` tag in the paragraph is replaced in place by a string holding
    a unique CITETOKEN placeholder (or, for bracket-style documents, by its plain surface
    form when that does not look like a bracket citation). In bracket mode, two refs
    joined by a range marker (e.g. [1]-[4]) are expanded into one placeholder per
    citation in the range.

    :param para_el: paragraph element; modified in place
    :param sp: soup used to create the replacement strings
    :param bibs: bibliography entries keyed by normalized ref id (only membership is tested here)
    :param bracket: if True, apply bracket-style filtering and range expansion
    :return: dict mapping each CITETOKEN placeholder to (ref_id or None, surface_form)
    """

    # CHECK if range between two surface forms is appropriate for bracket style expansion
    # returns (first_number, last_number) or None if either form is not a single bracket
    # number or the difference is not strictly between 1 and 20
    def _get_surface_range(start_surface, end_surface):
        span1_match = SINGLE_BRACKET_REGEX.match(start_surface)
        span2_match = SINGLE_BRACKET_REGEX.match(end_surface)
        if span1_match and span2_match:
            # get numbers corresponding to citations
            span1_num = int(span1_match.group(1))
            span2_num = int(span2_match.group(1))
            # expand only if the numbers differ by more than 1 and less than 20
            if 1 < span2_num - span1_num < 20:
                return span1_num, span2_num
        return None

    # CREATE BIBREF range between two reference ids, e.g. BIBREF1-BIBREF4 -> BIBREF1 BIBREF2 BIBREF3 BIBREF4
    # the number is read from position 6 onward, i.e. after a 6-character prefix such as "BIBREF"
    def _create_ref_id_range(start_ref_id, end_ref_id):
        start_ref_num = int(start_ref_id[6:])
        end_ref_num = int(end_ref_id[6:])
        return [f"BIBREF{curr_ref_num}" for curr_ref_num in range(start_ref_num, end_ref_num + 1)]

    # CREATE surface form range between two bracket strings, e.g. [1]-[4] -> [1] [2] [3] [4]
    def _create_surface_range(start_number, end_number):
        return [f"[{n}]" for n in range(start_number, end_number + 1)]

    # create citation dict with keywords
    cite_map = dict()
    tokgen = UniqTokenGenerator("CITETOKEN")

    for rtag in para_el.find_all("ref"):
        # any AttributeError while handling a ref (e.g. an expected tag is missing)
        # abandons that ref and moves on to the next one
        try:
            # get surface span, e.g. [3]
            surface_span = rtag.text.strip()

            # check if target is available (e.g. #b2 -> BIBREF2)
            if rtag.get("target"):
                # normalize reference string
                rtag_ref_id = normalize_grobid_id(rtag.get("target"))

                # ref id not in bibliography: keep the citation but leave it unlinked
                if rtag_ref_id not in bibs:
                    cite_key = tokgen.next()
                    rtag.replace_with(sp.new_string(f" {cite_key} "))
                    cite_map[cite_key] = (None, surface_span)
                    continue

                # if bracket style, only keep if surface form is bracket
                if bracket:
                    # valid bracket span: starts with "[", or ends with "]" or ","
                    if surface_span and (
                        surface_span[0] == "["
                        or surface_span[-1] == "]"
                        or surface_span[-1] == ","
                    ):
                        pass
                    # invalid, replace tag with surface form and continue to next ref tag
                    else:
                        rtag.replace_with(sp.new_string(f" {surface_span} "))
                        continue
                # not bracket, add cite span and move on
                else:
                    cite_key = tokgen.next()
                    rtag.replace_with(sp.new_string(f" {cite_key} "))
                    cite_map[cite_key] = (rtag_ref_id, surface_span)
                    continue

                # EXTRA PROCESSING FOR BRACKET STYLE CITATIONS; EXPAND RANGES ###
                # look backward for range marker, e.g. [1]-*[3]*
                # collects the text between this ref and the previous sibling tag
                backward_between_span = ""
                for sib in rtag.previous_siblings:
                    if sib.name == "ref":
                        break
                    elif type(sib) is bs4.NavigableString:
                        backward_between_span += sib
                    else:
                        break

                # check if there's a backwards expansion, e.g. need to expand [1]-[3] -> [1] [2] [3]
                if is_expansion_string(backward_between_span):
                    # get surface number range
                    surface_num_range = _get_surface_range(
                        rtag.find_previous_sibling("ref").text.strip(), surface_span
                    )
                    # if the surface number range is reasonable (1 < difference < 20, in order), EXPAND
                    if surface_num_range:
                        # blank out everything between the previous ref tag and this one
                        # (i.e. delete "-" and extra spaces)
                        for sib in rtag.previous_siblings:
                            if sib.name == "ref":
                                break
                            elif type(sib) is bs4.NavigableString:
                                sib.replace_with(sp.new_string(""))
                            else:
                                break

                        # get ref id of previous ref, e.g. [1] (#b0 -> BIBREF0), and remove that tag
                        previous_rtag = rtag.find_previous_sibling("ref")
                        previous_rtag_ref_id = normalize_grobid_id(previous_rtag.get("target"))
                        previous_rtag.decompose()

                        # replace this ref tag with the full range expansion: one placeholder per
                        # (ref id, surface form) pair from the previous ref through this one
                        id_range = _create_ref_id_range(previous_rtag_ref_id, rtag_ref_id)
                        surface_range = _create_surface_range(
                            surface_num_range[0], surface_num_range[1]
                        )
                        replace_string = ""
                        for range_ref_id, range_surface_form in zip(id_range, surface_range):
                            # only link if ref id is in bibliography, else add none
                            if range_ref_id in bibs:
                                cite_key = tokgen.next()
                                cite_map[cite_key] = (range_ref_id, range_surface_form)
                            else:
                                cite_key = tokgen.next()
                                cite_map[cite_key] = (None, range_surface_form)
                            replace_string += cite_key + " "
                        rtag.replace_with(sp.new_string(f" {replace_string} "))
                    # ELSE do not expand backwards and replace previous and current rtag with appropriate ref id
                    else:
                        # add mapping between ref id and surface form for previous ref tag
                        # (it was left in place by the forward look when it was visited)
                        previous_rtag = rtag.find_previous_sibling("ref")
                        previous_rtag_ref_id = normalize_grobid_id(previous_rtag.get("target"))
                        previous_rtag_surface = previous_rtag.text.strip()
                        cite_key = tokgen.next()
                        previous_rtag.replace_with(sp.new_string(f" {cite_key} "))
                        cite_map[cite_key] = (
                            previous_rtag_ref_id,
                            previous_rtag_surface,
                        )

                        # add mapping between ref id and surface form for current reftag
                        cite_key = tokgen.next()
                        rtag.replace_with(sp.new_string(f" {cite_key} "))
                        cite_map[cite_key] = (rtag_ref_id, surface_span)
                else:
                    # look forward and see if expansion string, e.g. *[1]*-[3]
                    forward_between_span = ""
                    for sib in rtag.next_siblings:
                        if sib.name == "ref":
                            break
                        elif type(sib) is bs4.NavigableString:
                            forward_between_span += sib
                        else:
                            break
                    # look forward for range marker (if is a range, continue -- range will be expanded
                    # when we get to the second value); the tag stays in the tree until then
                    if is_expansion_string(forward_between_span):
                        continue
                    # else treat like normal reference
                    else:
                        cite_key = tokgen.next()
                        rtag.replace_with(sp.new_string(f" {cite_key} "))
                        cite_map[cite_key] = (rtag_ref_id, surface_span)

            # no target attribute: keep the citation but leave it unlinked
            else:
                cite_key = tokgen.next()
                rtag.replace_with(sp.new_string(f" {cite_key} "))
                cite_map[cite_key] = (None, surface_span)
        except AttributeError:
            continue

    return cite_map


def process_paragraph(
    sp: bs4.BeautifulSoup,
    para_el: bs4.element.Tag,
    section_names: List[Tuple],
    bib_dict: Dict,
    ref_dict: Dict,
    bracket: bool,
) -> Dict:
    """
    Convert one paragraph element into a text blob with citation and reference spans
    :param sp: soup the paragraph belongs to
    :param para_el: paragraph element; modified in place
    :param section_names: section path stored with the paragraph
    :param bib_dict: bibliography entries keyed by normalized ref id
    :param ref_dict: figure/table entries keyed by normalized ref id
    :param bracket: if bracket style, expand and clean up citations
    :return: dict with keys text, cite_spans, ref_spans, eq_spans and section
    """
    # nothing to do for a paragraph without text
    if not para_el.text:
        return {
            "text": "",
            "cite_spans": [],
            "ref_spans": [],
            "eq_spans": [],
            "section": section_names,
        }

    # formulas become plain text, figure/table refs and citations become placeholders
    process_formulas_in_paragraph(para_el, sp)
    ref_map = process_references_in_paragraph(para_el, sp, ref_dict)
    cite_map = process_citations_in_paragraph(para_el, sp, bib_dict, bracket)

    # collapse every whitespace run to a single space (after this no other whitespace
    # character is left in the text)
    para_text = re.sub(r"\s+", " ", para_el.text)

    # locate the placeholders: citations first, then figure/table references
    all_spans_to_replace = []
    for pattern, token_map in ((r"(CITETOKEN\d+)", cite_map), (r"(REFTOKEN\d+)", ref_map)):
        for match in re.finditer(pattern, para_text):
            uniq_token = match.group()
            surface_text = token_map[uniq_token][1]
            all_spans_to_replace.append((match.start(), match.end(), uniq_token, surface_text))

    # put the surface forms back and get their positions in the final text
    para_text, all_spans_to_replace = sub_spans_and_update_indices(all_spans_to_replace, para_text)

    # build the json blobs, split by placeholder kind
    cite_span_blobs = []
    ref_span_blobs = []
    for start, end, token, surface in all_spans_to_replace:
        if token.startswith("CITETOKEN"):
            cite_span_blobs.append(
                {"start": start, "end": end, "text": surface, "ref_id": cite_map[token][0]}
            )
        elif token.startswith("REFTOKEN"):
            ref_span_blobs.append(
                {"start": start, "end": end, "text": surface, "ref_id": ref_map[token][0]}
            )

    # every span must point at its own surface form in the final text
    for blob in cite_span_blobs + ref_span_blobs:
        assert para_text[blob["start"] : blob["end"]] == blob["text"]

    return {
        "text": para_text,
        "cite_spans": cite_span_blobs,
        "ref_spans": ref_span_blobs,
        "eq_spans": [],
        "section": section_names,
    }


def extract_abstract_from_tei_xml(
    sp: bs4.BeautifulSoup, bib_dict: Dict, ref_dict: Dict, cleanup_bracket: bool
) -> List[Dict]:
    """
    Turn the abstract element into a list of paragraph blobs and remove it from the soup
    :param sp: parsed document; the abstract element is removed from it
    :param bib_dict: bibliography entries keyed by normalized ref id
    :param ref_dict: figure/table entries keyed by normalized ref id
    :param cleanup_bracket: whether bracket-style citation clean-up is applied
    :return: one paragraph dict per non-empty abstract paragraph
    """
    abstract_text = []
    abstract = sp.abstract
    if not abstract:
        return abstract_text

    def _collect(node) -> None:
        # process the node as an "Abstract" paragraph, but only if it carries text
        if node.text:
            abstract_text.append(
                process_paragraph(
                    sp,
                    node,
                    [(None, "Abstract")],
                    bib_dict,
                    ref_dict,
                    cleanup_bracket,
                )
            )

    if abstract.div:
        # abstract organised in divs: use their paragraphs, or the div itself if it has none
        for div in abstract.find_all("div"):
            if not div.text:
                continue
            if div.p:
                for para in div.find_all("p"):
                    _collect(para)
            else:
                _collect(div)
    elif abstract.p:
        # abstract made of bare paragraphs
        for para in abstract.find_all("p"):
            _collect(para)
    else:
        # neither divs nor paragraphs: fall back to the abstract's own text
        _collect(abstract)

    abstract.decompose()
    return abstract_text


def extract_body_text_from_div(
    sp: bs4.BeautifulSoup,
    div: bs4.element.Tag,
    sections: List[Tuple],
    bib_dict: Dict,
    ref_dict: Dict,
    cleanup_bracket: bool,
) -> List[Dict]:
    """
    Parse body text from one div, recursing into the divs it contains.

    Nested divs are processed first (a div with a head extends the section path by
    (head "n" attribute, head text) and has that head removed afterwards); then the
    direct children of `div` are turned into chunks: `p` elements via
    process_paragraph and `formula` elements into fixed "EQUATION" placeholder chunks.

    :param sp: soup the div belongs to
    :param div: element whose content is extracted; modified in place
    :param sections: section path (list of (number, title) tuples) leading to this div
    :param bib_dict: bibliography entries keyed by normalized ref id
    :param ref_dict: figure/table entries keyed by normalized ref id
    :param cleanup_bracket: whether bracket-style citation clean-up is applied
    :return: list of paragraph/equation dicts in processing order
    """
    chunks = []
    # check if nested divs; recursively process
    # NOTE(review): find_all("div") also returns divs nested deeper than one level, and the
    # recursive call visits those again — confirm whether deeper nesting occurs in practice
    if div.div:
        for subdiv in div.find_all("div"):
            # has header, add to section list and process
            if subdiv.head:
                chunks += extract_body_text_from_div(
                    sp,
                    subdiv,
                    sections + [(subdiv.head.get("n", None), subdiv.head.text.strip())],
                    bib_dict,
                    ref_dict,
                    cleanup_bracket,
                )
                # subdiv.head is looked up again here, after the recursive call has run
                subdiv.head.decompose()
            # no header, process with same section list
            else:
                chunks += extract_body_text_from_div(
                    sp, subdiv, sections, bib_dict, ref_dict, cleanup_bracket
                )
    # process the direct children of this div individually
    for tag in div:
        try:
            if tag.name == "p":
                if tag.text:
                    chunks.append(
                        process_paragraph(sp, tag, sections, bib_dict, ref_dict, cleanup_bracket)
                    )
            elif tag.name == "formula":
                # e.g. <formula xml:id="formula_0">Y = W T X.<label>(1)</label></formula>
                # the label is read and removed first so that eq_text excludes it
                label = tag.label.text
                tag.label.decompose()
                eq_text = tag.text
                chunks.append(
                    {
                        "text": "EQUATION",
                        "cite_spans": [],
                        "ref_spans": [],
                        "eq_spans": [
                            {
                                "start": 0,
                                "end": 8,
                                "text": "EQUATION",
                                "ref_id": "EQREF",
                                "raw_str": eq_text,
                                "eq_num": label,
                            }
                        ],
                        "section": sections,
                    }
                )
        except AttributeError:
            # fallback when handling the child raised AttributeError (e.g. a formula
            # without a label): treat the child as a paragraph if it has text
            if tag.text:
                chunks.append(
                    process_paragraph(sp, tag, sections, bib_dict, ref_dict, cleanup_bracket)
                )

    return chunks


def extract_body_text_from_tei_xml(
    sp: bs4.BeautifulSoup, bib_dict: Dict, ref_dict: Dict, cleanup_bracket: bool
) -> List[Dict]:
    """
    Extract all body paragraphs from the soup and remove the body element
    :param sp: parsed document; the body element is removed from it
    :param bib_dict: bibliography entries keyed by normalized ref id
    :param ref_dict: figure/table entries keyed by normalized ref id
    :param cleanup_bracket: whether bracket-style citation clean-up is applied
    :return: list of paragraph dicts; empty if the document has no body
    """
    body = sp.body
    if not body:
        return []

    # the body is handled like a div with an empty section path
    body_text = extract_body_text_from_div(sp, body, [], bib_dict, ref_dict, cleanup_bracket)
    sp.body.decompose()
    return body_text


def extract_back_matter_from_tei_xml(
    sp: bs4.BeautifulSoup, bib_dict: Dict, ref_dict: Dict, cleanup_bracket: bool
) -> List[Dict]:
    """
    Extract the back matter paragraphs from the soup and remove the back element
    :param sp: parsed document; the back element is removed from it
    :param bib_dict: bibliography entries keyed by normalized ref id
    :param ref_dict: figure/table entries keyed by normalized ref id
    :param cleanup_bracket: whether bracket-style citation clean-up is applied
    :return: one paragraph dict per non-empty div nested inside a back matter div
    """
    back_text = []
    if not sp.back:
        return back_text

    for div in sp.back.find_all("div"):
        # the outer div's type serves as section title for children without a head
        section_type = div.get("type") or ""

        for child_div in div.find_all("div"):
            heading = child_div.head
            if heading:
                section_title = heading.text.strip()
                section_num = heading.get("n", None)
                # drop the head so its text is not part of the paragraph
                heading.decompose()
            else:
                section_title = section_type
                section_num = None

            if not child_div.text:
                continue
            back_text.append(
                process_paragraph(
                    sp,
                    child_div,
                    [(section_num, section_title)],
                    bib_dict,
                    ref_dict,
                    cleanup_bracket,
                )
            )

    sp.back.decompose()
    return back_text