import re
from typing import Dict, List, Tuple

import bs4

|
def replace_refspans(
    spans_to_replace: List[Tuple[int, int, str, str]],
    full_string: str,
    pre_padding: str = "",
    post_padding: str = "",
    btwn_padding: str = ", ",
) -> str:
    """
    For each span within the full string, replace that span with new text
    :param spans_to_replace: list of tuples of form (start_ind, end_ind, span_text, new_substring)
    :param full_string: string in which to make replacements
    :param pre_padding: padding to add before each new substring
    :param post_padding: padding to add after each new substring
    :param btwn_padding: padding to add between adjacent new substrings
    :return: full_string with all spans replaced
    """
    # assert all spans match the text of full_string at those indices
    assert all([full_string[start:end] == span for start, end, span, _ in spans_to_replace])

    # assert no two spans start at the same index
    start_inds = [rep[0] for rep in spans_to_replace]
    assert len(set(start_inds)) == len(start_inds)

    # sort by start index
    spans_to_replace.sort(key=lambda x: x[0])

    # compute the replacement string for each span and shift the remaining spans
    for i, entry in enumerate(spans_to_replace):
        start, end, span, new_string = entry

        # skip spans that have been zeroed out (marked for deletion)
        if end <= 0:
            continue

        # amount by which all subsequent spans need to shift
        shift_amount = len(new_string) - len(span) + len(pre_padding) + len(post_padding)

        # shift all subsequent spans
        for ind in range(i + 1, len(spans_to_replace)):
            next_start, next_end, next_span, next_string = spans_to_replace[ind]

            # skip spans that have been zeroed out
            if next_end <= 0:
                continue

            # if overlapping with the current span, zero out (delete)
            if next_start < end:
                next_start = 0
                next_end = 0
                next_string = ""

            # if directly adjacent, join with btwn_padding
            elif next_start == end:
                next_start += shift_amount
                next_end += shift_amount
                next_string = btwn_padding + pre_padding + next_string + post_padding

            # otherwise, just shift and pad
            elif next_start > end:
                next_start += shift_amount
                next_end += shift_amount
                next_string = pre_padding + next_string + post_padding

            spans_to_replace[ind] = (next_start, next_end, next_span, next_string)

    # drop deleted spans and re-sort
    spans_to_replace = [entry for entry in spans_to_replace if entry[1] > 0]
    spans_to_replace.sort(key=lambda x: x[0])

    # apply the replacements
    for start, end, span, new_string in spans_to_replace:
        assert full_string[start:end] == span
        full_string = full_string[:start] + new_string + full_string[end:]

    return full_string

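# Usage sketch for replace_refspans (not part of the original module; values are illustrative):
#
#     >>> replace_refspans([(0, 5, "Hello", "Hi")], "Hello world")
#     'Hi world'
#     >>> replace_refspans([(0, 3, "foo", "A"), (3, 6, "bar", "B")], "foobar!")
#     'A, B!'
#
# Adjacent spans are joined with btwn_padding (", " by default).
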
|
# bracketed numeric citation groups, e.g. [1], [2-5], [1, 3; 7]
BRACKET_REGEX = re.compile(r"\[[1-9]\d{0,2}([,;\-\s]+[1-9]\d{0,2})*;?\]")
BRACKET_STYLE_THRESHOLD = 5

SINGLE_BRACKET_REGEX = re.compile(r"\[([1-9]\d{0,2})\]")
EXPANSION_CHARS = {"-", "–"}
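# Regex behavior sketch (illustrative, not part of the original module):
#
#     >>> bool(BRACKET_REGEX.match("[3]")), bool(BRACKET_REGEX.match("[2-5]"))
#     (True, True)
#     >>> bool(BRACKET_REGEX.match("(Smith, 2020)"))
#     False
#     >>> SINGLE_BRACKET_REGEX.match("[12]").group(1)
#     '12'
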
|
# map Grobid/TEI table markup to its HTML equivalents
REPLACE_TABLE_TOKS = {
    "<row>": "<tr>",
    "<row/>": "<tr/>",
    "</row>": "</tr>",
    "<cell>": "<td>",
    "<cell/>": "<td/>",
    "</cell>": "</td>",
    "<cell ": "<td ",
    "cols=": "colspan=",
}

|
def span_already_added(sub_start: int, sub_end: int, span_indices: List[Tuple[int, int]]) -> bool:
    """
    Check if span is a subspan of an existing span
    :param sub_start: start index of the candidate span
    :param sub_end: end index of the candidate span
    :param span_indices: list of (start, end) tuples for spans already added
    :return: True if the candidate span falls within an existing span
    """
    for span_start, span_end in span_indices:
        if sub_start >= span_start and sub_end <= span_end:
            return True
    return False
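# Usage sketch (illustrative):
#
#     >>> span_already_added(2, 5, [(0, 10)])
#     True
#     >>> span_already_added(8, 12, [(0, 10)])
#     False
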
|

def is_expansion_string(between_string: str) -> bool:
    """
    Check if the string between two refs is an expansion string (e.g. the dash in "[1]-[4]")
    :param between_string: text between two consecutive reference tags
    :return: True if the text indicates a citation range expansion
    """
    if (
        len(between_string) <= 2
        and any([c in EXPANSION_CHARS for c in between_string])
        and all([c in EXPANSION_CHARS.union({" "}) for c in between_string])
    ):
        return True
    return False
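# Usage sketch (illustrative):
#
#     >>> is_expansion_string("-")
#     True
#     >>> is_expansion_string(", ")
#     False
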
|

def _clean_empty_and_duplicate_authors_from_grobid_parse(
    authors: List[Dict],
) -> List[Dict]:
    """
    Within affiliation, `location` is a dict with fields <settlement>, <region>, <country>, <postCode>, etc.
    Too much hassle, so just take the first one that's not empty.
    """
    # strip out authors with no name
    clean_authors_list = []
    for author in authors:
        clean_first = author["first"].strip()
        clean_last = author["last"].strip()
        clean_middle = [m.strip() for m in author["middle"]]
        clean_suffix = author["suffix"].strip()
        if clean_first or clean_last or clean_middle:
            author["first"] = clean_first
            author["last"] = clean_last
            author["middle"] = clean_middle
            author["suffix"] = clean_suffix
            clean_authors_list.append(author)

    # deduplicate authors by name, preserving original order
    key_to_author_blobs = {}
    ordered_keys_by_author_pos = []
    for author in clean_authors_list:
        key = (
            author["first"],
            author["last"],
            " ".join(author["middle"]),
            author["suffix"],
        )
        if key not in key_to_author_blobs:
            key_to_author_blobs[key] = author
            ordered_keys_by_author_pos.append(key)
        else:
            # keep any email or non-empty affiliation found on a duplicate entry
            if author["email"]:
                key_to_author_blobs[key]["email"] = author["email"]
            if author["affiliation"] and (
                author["affiliation"]["institution"]
                or author["affiliation"]["laboratory"]
                or author["affiliation"]["location"]
            ):
                key_to_author_blobs[key]["affiliation"] = author["affiliation"]
    dedup_authors_list = [key_to_author_blobs[key] for key in ordered_keys_by_author_pos]
    return dedup_authors_list
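# Behavior sketch (hypothetical author dicts matching the Grobid parse format assumed above):
#
#     >>> authors = [
#     ...     {"first": "Jane", "middle": [], "last": "Doe", "suffix": "", "email": "", "affiliation": {}},
#     ...     {"first": " ", "middle": [], "last": "", "suffix": "", "email": "", "affiliation": {}},
#     ...     {"first": "Jane", "middle": [], "last": "Doe", "suffix": "", "email": "j@x.org", "affiliation": {}},
#     ... ]
#     >>> [a["email"] for a in _clean_empty_and_duplicate_authors_from_grobid_parse(authors)]
#     ['j@x.org']
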
|

def sub_spans_and_update_indices(
    spans_to_replace: List[Tuple[int, int, str, str]], full_string: str
) -> Tuple[str, List]:
    """
    Replace all spans and recompute indices
    :param spans_to_replace: list of tuples of form (start_ind, end_ind, token, surface_form)
    :param full_string: string in which to make replacements
    :return: (new string, spans with indices updated to the new string)
    """
    # assert all spans match the text of full_string at those indices
    assert all([full_string[start:end] == token for start, end, token, _ in spans_to_replace])

    # assert no two spans start at the same index
    start_inds = [rep[0] for rep in spans_to_replace]
    assert len(set(start_inds)) == len(start_inds)

    # sort by start index
    spans_to_replace.sort(key=lambda x: x[0])

    # each new_spans entry carries an extra accumulated offset for all later spans
    new_spans = [
        (start, end, token, surface, 0) for start, end, token, surface in spans_to_replace
    ]
    for i, entry in enumerate(spans_to_replace):
        start, end, token, surface = entry
        new_end = start + len(surface)
        offset = new_end - end

        # update this span's end index to reflect the surface form's length
        new_spans[i] = (
            new_spans[i][0],
            new_spans[i][1] + offset,
            new_spans[i][2],
            new_spans[i][3],
            new_spans[i][4],
        )

        # propagate the offset to all subsequent spans
        for j in range(i + 1, len(new_spans)):
            new_spans[j] = (
                new_spans[j][0],
                new_spans[j][1],
                new_spans[j][2],
                new_spans[j][3],
                new_spans[j][4] + offset,
            )

    # perform the replacements and apply the accumulated offsets
    new_text = replace_refspans(spans_to_replace, full_string, btwn_padding="")
    result = [
        (start + offset, end + offset, token, surface)
        for start, end, token, surface, offset in new_spans
    ]

    return new_text, result
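# Usage sketch (illustrative):
#
#     >>> sub_spans_and_update_indices([(4, 10, "TOKEN0", "[1]")], "see TOKEN0 here")
#     ('see [1] here', [(4, 7, 'TOKEN0', '[1]')])
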
|

class UniqTokenGenerator:
    """
    Generate unique token
    """

    def __init__(self, tok_string):
        self.tok_string = tok_string
        self.ind = 0

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        new_token = f"{self.tok_string}{self.ind}"
        self.ind += 1
        return new_token
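# Usage sketch (illustrative):
#
#     >>> tokgen = UniqTokenGenerator("CITETOKEN")
#     >>> tokgen.next(), tokgen.next()
#     ('CITETOKEN0', 'CITETOKEN1')
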
|

def normalize_grobid_id(grobid_id: str):
    """
    Normalize grobid object identifiers
    :param grobid_id: raw identifier from the TEI XML
    :return: normalized identifier
    """
    str_norm = grobid_id.upper().replace("_", "").replace("#", "")
    if str_norm.startswith("B"):
        return str_norm.replace("B", "BIBREF")
    if str_norm.startswith("TAB"):
        return str_norm.replace("TAB", "TABREF")
    if str_norm.startswith("FIG"):
        return str_norm.replace("FIG", "FIGREF")
    if str_norm.startswith("FORMULA"):
        return str_norm.replace("FORMULA", "EQREF")
    return str_norm
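# Usage sketch (illustrative):
#
#     >>> normalize_grobid_id("#b12")
#     'BIBREF12'
#     >>> normalize_grobid_id("tab_2")
#     'TABREF2'
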
|

def extract_formulas_from_tei_xml(sp: bs4.BeautifulSoup) -> None:
    """
    Replace all formulas with the text
    :param sp:
    :return:
    """
    for eq in sp.find_all("formula"):
        eq.replace_with(sp.new_string(eq.text.strip()))

|
def table_to_html(table: bs4.element.Tag) -> str:
    """
    Sub table tags with html table tags
    :param table: TEI <table> element
    :return: table serialized as an HTML string
    """
    for tag in table:
        if tag.name != "row":
            print(f"Unknown table subtag: {tag.name}")
            tag.decompose()
    table_str = str(table)
    for token, subtoken in REPLACE_TABLE_TOKS.items():
        table_str = table_str.replace(token, subtoken)
    return table_str
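# Behavior sketch (illustrative TEI fragment, assuming the <table> contains only <row>/<cell> children):
#
#     <table><row><cell>1</cell></row></table>
#     -> "<table><tr><td>1</td></tr></table>"
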
|

def extract_figures_and_tables_from_tei_xml(sp: bs4.BeautifulSoup) -> Dict[str, Dict]:
    """
    Generate figure and table dicts
    :param sp: BeautifulSoup of the TEI XML
    :return: map from normalized figure/table id to its entry
    """
    ref_map = dict()

    for fig in sp.find_all("figure"):
        try:
            if fig.name and fig.get("xml:id"):
                if fig.get("type") == "table":
                    ref_map[normalize_grobid_id(fig.get("xml:id"))] = {
                        "text": (
                            fig.figDesc.text.strip()
                            if fig.figDesc
                            else fig.head.text.strip() if fig.head else ""
                        ),
                        "latex": None,
                        "type": "table",
                        "content": table_to_html(fig.table),
                        "fig_num": fig.get("xml:id"),
                    }
                else:
                    # use the label as the figure number if it contains any digits
                    if True in [char.isdigit() for char in fig.findNext("head").findNext("label")]:
                        fig_num = fig.findNext("head").findNext("label").contents[0]
                    else:
                        fig_num = None
                    ref_map[normalize_grobid_id(fig.get("xml:id"))] = {
                        "text": fig.figDesc.text.strip() if fig.figDesc else "",
                        "latex": None,
                        "type": "figure",
                        "content": "",
                        "fig_num": fig_num,
                    }
        except AttributeError:
            continue
        fig.decompose()

    return ref_map

|
def check_if_citations_are_bracket_style(sp: bs4.BeautifulSoup) -> bool:
    """
    Check if the document has bracket style citations
    :param sp: BeautifulSoup of the TEI XML
    :return: True if more than BRACKET_STYLE_THRESHOLD citations look bracket-style
    """
    cite_strings = []
    if sp.body:
        for div in sp.body.find_all("div"):
            if div.head:
                continue
            for rtag in div.find_all("ref"):
                ref_type = rtag.get("type")
                if ref_type == "bibr":
                    cite_strings.append(rtag.text.strip())

    # check how many citation strings match the bracket pattern
    bracket_style = [bool(BRACKET_REGEX.match(cite_str)) for cite_str in cite_strings]

    # return true if the count exceeds the threshold
    if sum(bracket_style) > BRACKET_STYLE_THRESHOLD:
        return True

    return False

|
def sub_all_note_tags(sp: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
    """
    Sub all note tags with p tags
    :param sp: BeautifulSoup of the TEI XML
    :return: soup with <note> tags replaced by <p> tags
    """
    for ntag in sp.find_all("note"):
        p_tag = sp.new_tag("p")
        p_tag.string = ntag.text.strip()
        ntag.replace_with(p_tag)
    return sp

|
def process_formulas_in_paragraph(para_el: bs4.BeautifulSoup, sp: bs4.BeautifulSoup) -> None:
    """
    Process all formulas in paragraph and replace with text and label
    :param para_el: paragraph element
    :param sp: BeautifulSoup of the TEI XML
    :return:
    """
    for ftag in para_el.find_all("formula"):
        # get label if it exists and append it to the formula text
        if ftag.label:
            label = " " + ftag.label.text
            ftag.label.decompose()
        else:
            label = ""
        ftag.replace_with(sp.new_string(f"{ftag.text.strip()}{label}"))

|
def process_references_in_paragraph(
    para_el: bs4.BeautifulSoup, sp: bs4.BeautifulSoup, refs: Dict
) -> Dict:
    """
    Process all references in paragraph and generate a dict that contains (ref_id, surface_form, ref_type)
    :param para_el: paragraph element
    :param sp: BeautifulSoup of the TEI XML
    :param refs: map from normalized figure/table id to its entry
    :return: map from reference token to (ref_id, surface form, ref type)
    """
    tokgen = UniqTokenGenerator("REFTOKEN")
    ref_dict = dict()
    for rtag in para_el.find_all("ref"):
        try:
            ref_type = rtag.get("type")
            # skip bibliography references; those are handled by process_citations_in_paragraph
            if ref_type == "bibr":
                continue
            if ref_type == "table" or ref_type == "figure":
                ref_id = rtag.get("target")
                if ref_id and normalize_grobid_id(ref_id) in refs:
                    # normalize reference string
                    rtag_string = normalize_grobid_id(ref_id)
                else:
                    rtag_string = None
                # add to ref_dict and replace the tag with a unique token
                ref_key = tokgen.next()
                ref_dict[ref_key] = (rtag_string, rtag.text.strip(), ref_type)
                rtag.replace_with(sp.new_string(f" {ref_key} "))
            else:
                # replace with surface form for all other reference types
                rtag.replace_with(sp.new_string(rtag.text.strip()))
        except AttributeError:
            continue
    return ref_dict

|
def process_citations_in_paragraph(
    para_el: bs4.BeautifulSoup, sp: bs4.BeautifulSoup, bibs: Dict, bracket: bool
) -> Dict:
    """
    Process all citations in paragraph and generate a dict for surface forms
    :param para_el: paragraph element
    :param sp: BeautifulSoup of the TEI XML
    :param bibs: map from normalized bibliography id to its entry
    :param bracket: True if the document uses bracket-style citations
    :return: map from citation token to (bib_ref_id, surface form)
    """

    def _get_surface_range(start_surface, end_surface):
        span1_match = SINGLE_BRACKET_REGEX.match(start_surface)
        span2_match = SINGLE_BRACKET_REGEX.match(end_surface)
        if span1_match and span2_match:
            # get citation numbers
            span1_num = int(span1_match.group(1))
            span2_num = int(span2_match.group(1))
            # only expand if the range is reasonable
            if 1 < span2_num - span1_num < 20:
                return span1_num, span2_num
        return None

    def _create_ref_id_range(start_ref_id, end_ref_id):
        start_ref_num = int(start_ref_id[6:])
        end_ref_num = int(end_ref_id[6:])
        return [f"BIBREF{curr_ref_num}" for curr_ref_num in range(start_ref_num, end_ref_num + 1)]

    def _create_surface_range(start_number, end_number):
        return [f"[{n}]" for n in range(start_number, end_number + 1)]

    cite_map = dict()
    tokgen = UniqTokenGenerator("CITETOKEN")

    for rtag in para_el.find_all("ref"):
        try:
            # get surface span, e.g. "[3]"
            surface_span = rtag.text.strip()

            # check if a bibliography target is available
            if rtag.get("target"):
                # normalize reference string
                rtag_ref_id = normalize_grobid_id(rtag.get("target"))

                # if the target is not in the bibliography, keep the surface form only
                if rtag_ref_id not in bibs:
                    cite_key = tokgen.next()
                    rtag.replace_with(sp.new_string(f" {cite_key} "))
                    cite_map[cite_key] = (None, surface_span)
                    continue

                # if bracket style, only keep citations whose surface form looks like a bracket
                if bracket:
                    # valid bracket span
                    if surface_span and (
                        surface_span[0] == "["
                        or surface_span[-1] == "]"
                        or surface_span[-1] == ","
                    ):
                        pass
                    # invalid bracket span, replace with surface form and move on
                    else:
                        rtag.replace_with(sp.new_string(f" {surface_span} "))
                        continue
                # not bracket style, add cite span and move on
                else:
                    cite_key = tokgen.next()
                    rtag.replace_with(sp.new_string(f" {cite_key} "))
                    cite_map[cite_key] = (rtag_ref_id, surface_span)
                    continue

                # bracket-style citations only from here on:
                # collect text between this ref and the previous one
                backward_between_span = ""
                for sib in rtag.previous_siblings:
                    if sib.name == "ref":
                        break
                    elif type(sib) is bs4.NavigableString:
                        backward_between_span += sib
                    else:
                        break

                # if the preceding text is an expansion string (e.g. the "-" in "[1]-[4]"), expand the range
                if is_expansion_string(backward_between_span):
                    # get the numeric range covered by the previous and current surface forms
                    surface_num_range = _get_surface_range(
                        rtag.find_previous_sibling("ref").text.strip(), surface_span
                    )
                    # if the range is reasonable, expand into one citation per number
                    if surface_num_range:
                        # remove the expansion text between the two refs
                        for sib in rtag.previous_siblings:
                            if sib.name == "ref":
                                break
                            elif type(sib) is bs4.NavigableString:
                                sib.replace_with(sp.new_string(""))
                            else:
                                break

                        # get the previous ref's id and remove its tag
                        previous_rtag = rtag.find_previous_sibling("ref")
                        previous_rtag_ref_id = normalize_grobid_id(previous_rtag.get("target"))
                        previous_rtag.decompose()

                        # create ref ids and surface forms for the full range
                        id_range = _create_ref_id_range(previous_rtag_ref_id, rtag_ref_id)
                        surface_range = _create_surface_range(
                            surface_num_range[0], surface_num_range[1]
                        )
                        replace_string = ""
                        for range_ref_id, range_surface_form in zip(id_range, surface_range):
                            # only link to the bibliography if the id exists there
                            if range_ref_id in bibs:
                                cite_key = tokgen.next()
                                cite_map[cite_key] = (range_ref_id, range_surface_form)
                            else:
                                cite_key = tokgen.next()
                                cite_map[cite_key] = (None, range_surface_form)
                            replace_string += cite_key + " "
                        rtag.replace_with(sp.new_string(f" {replace_string} "))

                    # if the range is invalid, process the previous and current refs separately
                    else:
                        previous_rtag = rtag.find_previous_sibling("ref")
                        previous_rtag_ref_id = normalize_grobid_id(previous_rtag.get("target"))
                        previous_rtag_surface = previous_rtag.text.strip()
                        cite_key = tokgen.next()
                        previous_rtag.replace_with(sp.new_string(f" {cite_key} "))
                        cite_map[cite_key] = (
                            previous_rtag_ref_id,
                            previous_rtag_surface,
                        )

                        # add the current ref
                        cite_key = tokgen.next()
                        rtag.replace_with(sp.new_string(f" {cite_key} "))
                        cite_map[cite_key] = (rtag_ref_id, surface_span)
                else:
                    # collect text between this ref and the next one
                    forward_between_span = ""
                    for sib in rtag.next_siblings:
                        if sib.name == "ref":
                            break
                        elif type(sib) is bs4.NavigableString:
                            forward_between_span += sib
                        else:
                            break

                    # if the next gap is an expansion string, handle this ref when processing the next one
                    if is_expansion_string(forward_between_span):
                        continue
                    else:
                        cite_key = tokgen.next()
                        rtag.replace_with(sp.new_string(f" {cite_key} "))
                        cite_map[cite_key] = (rtag_ref_id, surface_span)
            else:
                # no target, keep the surface form only
                cite_key = tokgen.next()
                rtag.replace_with(sp.new_string(f" {cite_key} "))
                cite_map[cite_key] = (None, surface_span)
        except AttributeError:
            continue

    return cite_map

|
def process_paragraph(
    sp: bs4.BeautifulSoup,
    para_el: bs4.element.Tag,
    section_names: List[Tuple],
    bib_dict: Dict,
    ref_dict: Dict,
    bracket: bool,
) -> Dict:
    """
    Process one paragraph
    :param sp: BeautifulSoup of the TEI XML
    :param para_el: paragraph element
    :param section_names: list of (section number, section title) tuples
    :param bib_dict: map from bibliography id to its entry
    :param ref_dict: map from figure/table id to its entry
    :param bracket: if bracket style, expand and clean up citations
    :return: paragraph dict with text, cite_spans, ref_spans, eq_spans, and section
    """
    # return an empty paragraph if there is no text
    if not para_el.text:
        return {
            "text": "",
            "cite_spans": [],
            "ref_spans": [],
            "eq_spans": [],
            "section": section_names,
        }

    # replace formulas with their text
    process_formulas_in_paragraph(para_el, sp)

    # replace figure and table references with unique tokens
    ref_map = process_references_in_paragraph(para_el, sp, ref_dict)

    # replace citations with unique tokens
    cite_map = process_citations_in_paragraph(para_el, sp, bib_dict, bracket)

    # normalize whitespace
    para_text = re.sub(r"\s+", " ", para_el.text)
    para_text = re.sub(r"\s", " ", para_text)

    # locate all unique tokens in the paragraph text
    all_spans_to_replace = []
    for span in re.finditer(r"(CITETOKEN\d+)", para_text):
        uniq_token = span.group()
        ref_id, surface_text = cite_map[uniq_token]
        all_spans_to_replace.append(
            (span.start(), span.start() + len(uniq_token), uniq_token, surface_text)
        )
    for span in re.finditer(r"(REFTOKEN\d+)", para_text):
        uniq_token = span.group()
        ref_id, surface_text, ref_type = ref_map[uniq_token]
        all_spans_to_replace.append(
            (span.start(), span.start() + len(uniq_token), uniq_token, surface_text)
        )

    # replace the tokens with their surface forms and recompute span indices
    para_text, all_spans_to_replace = sub_spans_and_update_indices(all_spans_to_replace, para_text)

    cite_span_blobs = [
        {"start": start, "end": end, "text": surface, "ref_id": cite_map[token][0]}
        for start, end, token, surface in all_spans_to_replace
        if token.startswith("CITETOKEN")
    ]

    ref_span_blobs = [
        {"start": start, "end": end, "text": surface, "ref_id": ref_map[token][0]}
        for start, end, token, surface in all_spans_to_replace
        if token.startswith("REFTOKEN")
    ]

    # sanity-check that span indices line up with the final paragraph text
    for cite_blob in cite_span_blobs:
        assert para_text[cite_blob["start"] : cite_blob["end"]] == cite_blob["text"]

    for ref_blob in ref_span_blobs:
        assert para_text[ref_blob["start"] : ref_blob["end"]] == ref_blob["text"]

    return {
        "text": para_text,
        "cite_spans": cite_span_blobs,
        "ref_spans": ref_span_blobs,
        "eq_spans": [],
        "section": section_names,
    }
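# Output shape sketch for process_paragraph (field values are illustrative):
#
#     {
#         "text": "Prior work [1] showed ...",
#         "cite_spans": [{"start": 11, "end": 14, "text": "[1]", "ref_id": "BIBREF1"}],
#         "ref_spans": [],
#         "eq_spans": [],
#         "section": [(None, "Introduction")],
#     }
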
|

def extract_abstract_from_tei_xml(
    sp: bs4.BeautifulSoup, bib_dict: Dict, ref_dict: Dict, cleanup_bracket: bool
) -> List[Dict]:
    """
    Parse abstract from soup
    :param sp: BeautifulSoup of the TEI XML
    :param bib_dict: map from bibliography id to its entry
    :param ref_dict: map from figure/table id to its entry
    :param cleanup_bracket: True if citations should be processed as bracket style
    :return: list of paragraph dicts
    """
    abstract_text = []
    if sp.abstract:
        # process abstract divs if they exist
        if sp.abstract.div:
            for div in sp.abstract.find_all("div"):
                if div.text:
                    if div.p:
                        for para in div.find_all("p"):
                            if para.text:
                                abstract_text.append(
                                    process_paragraph(
                                        sp,
                                        para,
                                        [(None, "Abstract")],
                                        bib_dict,
                                        ref_dict,
                                        cleanup_bracket,
                                    )
                                )
                    else:
                        if div.text:
                            abstract_text.append(
                                process_paragraph(
                                    sp,
                                    div,
                                    [(None, "Abstract")],
                                    bib_dict,
                                    ref_dict,
                                    cleanup_bracket,
                                )
                            )
        # otherwise, process abstract paragraphs if they exist
        elif sp.abstract.p:
            for para in sp.abstract.find_all("p"):
                if para.text:
                    abstract_text.append(
                        process_paragraph(
                            sp,
                            para,
                            [(None, "Abstract")],
                            bib_dict,
                            ref_dict,
                            cleanup_bracket,
                        )
                    )
        # otherwise, process the abstract element itself
        else:
            if sp.abstract.text:
                abstract_text.append(
                    process_paragraph(
                        sp,
                        sp.abstract,
                        [(None, "Abstract")],
                        bib_dict,
                        ref_dict,
                        cleanup_bracket,
                    )
                )
        sp.abstract.decompose()
    return abstract_text

|
def extract_body_text_from_div(
    sp: bs4.BeautifulSoup,
    div: bs4.element.Tag,
    sections: List[Tuple],
    bib_dict: Dict,
    ref_dict: Dict,
    cleanup_bracket: bool,
) -> List[Dict]:
    """
    Parse body text from soup
    :param sp: BeautifulSoup of the TEI XML
    :param div: div element to process
    :param sections: list of (section number, section title) tuples for the enclosing sections
    :param bib_dict: map from bibliography id to its entry
    :param ref_dict: map from figure/table id to its entry
    :param cleanup_bracket: True if citations should be processed as bracket style
    :return: list of paragraph dicts
    """
    chunks = []

    # process nested divs first, extending the section stack with each subdiv's heading
    if div.div:
        for subdiv in div.find_all("div"):
            # handle case where subdiv has a header
            if subdiv.head:
                chunks += extract_body_text_from_div(
                    sp,
                    subdiv,
                    sections + [(subdiv.head.get("n", None), subdiv.head.text.strip())],
                    bib_dict,
                    ref_dict,
                    cleanup_bracket,
                )
                subdiv.head.decompose()
            else:
                chunks += extract_body_text_from_div(
                    sp, subdiv, sections, bib_dict, ref_dict, cleanup_bracket
                )

    # process remaining tags in this div
    for tag in div:
        try:
            if tag.name == "p":
                if tag.text:
                    chunks.append(
                        process_paragraph(sp, tag, sections, bib_dict, ref_dict, cleanup_bracket)
                    )
            elif tag.name == "formula":
                # formulas with a <label> become EQUATION placeholders; the raw string and equation number are preserved
                label = tag.label.text
                tag.label.decompose()
                eq_text = tag.text
                chunks.append(
                    {
                        "text": "EQUATION",
                        "cite_spans": [],
                        "ref_spans": [],
                        "eq_spans": [
                            {
                                "start": 0,
                                "end": 8,
                                "text": "EQUATION",
                                "ref_id": "EQREF",
                                "raw_str": eq_text,
                                "eq_num": label,
                            }
                        ],
                        "section": sections,
                    }
                )
        except AttributeError:
            if tag.text:
                chunks.append(
                    process_paragraph(sp, tag, sections, bib_dict, ref_dict, cleanup_bracket)
                )

    return chunks

|
def extract_body_text_from_tei_xml(
    sp: bs4.BeautifulSoup, bib_dict: Dict, ref_dict: Dict, cleanup_bracket: bool
) -> List[Dict]:
    """
    Parse body text from soup
    :param sp: BeautifulSoup of the TEI XML
    :param bib_dict: map from bibliography id to its entry
    :param ref_dict: map from figure/table id to its entry
    :param cleanup_bracket: True if citations should be processed as bracket style
    :return: list of paragraph dicts
    """
    body_text = []
    if sp.body:
        body_text = extract_body_text_from_div(
            sp, sp.body, [], bib_dict, ref_dict, cleanup_bracket
        )
        sp.body.decompose()
    return body_text

|
def extract_back_matter_from_tei_xml(
    sp: bs4.BeautifulSoup, bib_dict: Dict, ref_dict: Dict, cleanup_bracket: bool
) -> List[Dict]:
    """
    Parse back matter from soup
    :param sp: BeautifulSoup of the TEI XML
    :param bib_dict: map from bibliography id to its entry
    :param ref_dict: map from figure/table id to its entry
    :param cleanup_bracket: True if citations should be processed as bracket style
    :return: list of paragraph dicts
    """
    back_text = []

    if sp.back:
        for div in sp.back.find_all("div"):
            if div.get("type"):
                section_type = div.get("type")
            else:
                section_type = ""

            for child_div in div.find_all("div"):
                # use the child div's heading as the section title if available
                if child_div.head:
                    section_title = child_div.head.text.strip()
                    section_num = child_div.head.get("n", None)
                    child_div.head.decompose()
                else:
                    section_title = section_type
                    section_num = None
                if child_div.text:
                    back_text.append(
                        process_paragraph(
                            sp,
                            child_div,
                            [(section_num, section_title)],
                            bib_dict,
                            ref_dict,
                            cleanup_bracket,
                        )
                    )
        sp.back.decompose()
    return back_text