File size: 2,766 Bytes
c8a32e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
import re
from collections import Counter
from rapidfuzz import fuzz
from marker.schema.merged import FullyMergedBlock
from typing import List, Tuple
def filter_common_elements(lines, page_count, threshold=.6):
    """Return the ids of spans whose text repeats across most pages.

    Args:
        lines: Sequence of line objects. Each line exposes ``spans``, and
            every span exposes ``text`` and ``span_id``.
        page_count: Number of pages the lines were collected from.
        threshold: Fraction of ``page_count`` that a text's occurrence count
            must exceed for the text to be treated as common.

    Returns:
        A list of ``span_id`` values, in input order, for every span whose
        text is common. Empty when there are fewer than three pages.
    """
    # We can't filter if we don't have enough pages to find common elements
    if page_count < 3:
        return []

    # Texts of four characters or fewer are never counted, so they can never
    # be flagged as common.
    counter = Counter(
        s.text for line in lines for s in line.spans if len(s.text) > 4
    )

    # A set keeps the per-span membership test below constant-time instead of
    # scanning a list for every span.
    common = {k for k, v in counter.items() if v > page_count * threshold}

    return [s.span_id for line in lines for s in line.spans if s.text in common]
def filter_header_footer(all_page_blocks, max_selected_lines=2):
    """Collect ids of spans that look like repeated page headers or footers.

    The first and last ``max_selected_lines`` non-blank lines of every page
    are gathered separately, and each group is passed to
    ``filter_common_elements`` to find text repeated across pages.

    Args:
        all_page_blocks: Sequence of page objects exposing
            ``get_nonblank_lines()``.
        max_selected_lines: How many lines to take from the top and from the
            bottom of each page.

    Returns:
        A list of span ids: header candidates first, then footer candidates.
        Empty when ``max_selected_lines`` is not positive.
    """
    # A non-positive count selects nothing. Without this guard the slice
    # ``[-0:]`` would pick up every line of the page instead of none.
    if max_selected_lines <= 0:
        return []

    first_lines = []
    last_lines = []
    for page in all_page_blocks:
        nonblank_lines = page.get_nonblank_lines()
        first_lines.extend(nonblank_lines[:max_selected_lines])
        last_lines.extend(nonblank_lines[-max_selected_lines:])

    page_count = len(all_page_blocks)
    bad_span_ids = filter_common_elements(first_lines, page_count)
    bad_span_ids += filter_common_elements(last_lines, page_count)
    return bad_span_ids
def replace_leading_trailing_digits(string, replacement):
    """Replace a digit run at the start, then at the end, of ``string``.

    Args:
        string: Text to clean.
        replacement: Text substituted for each matched digit run.

    Returns:
        The text after both substitutions have been applied.
    """
    # Two sequential passes: the trailing pass operates on the output of the
    # leading pass.
    for pattern in (r'^\d+', r'\d+$'):
        string = re.sub(pattern, replacement, string)
    return string
def find_overlap_elements(lst: List[Tuple[str, int]], string_match_thresh=.9, min_overlap=.05) -> List[int]:
    """Return the ids of strings that closely match several other entries.

    Args:
        lst: ``(text, id)`` pairs to compare against one another.
        string_match_thresh: Minimum similarity as a fraction of 1. It is
            scaled by 100 before being compared with ``fuzz.ratio``.
        min_overlap: Fraction of ``len(lst)`` that must match an entry for it
            to be reported. At least 3 matches are always required.

    Returns:
        Ids of the entries that reached the required match count, in input
        order.
    """
    score_cutoff = string_match_thresh * 100
    # Required number of similar entries: the larger of 3 and the
    # ``min_overlap`` share of the list.
    required_matches = max(3.0, len(lst) * min_overlap)
    candidates = [entry[0] for entry in lst]

    matched_ids = []
    for idx, (text, id_num) in enumerate(lst):
        # Count the other entries (never the entry itself) that are similar
        # enough to this one.
        similar = sum(
            1
            for other_idx, other in enumerate(candidates)
            if other_idx != idx and fuzz.ratio(text, other) >= score_cutoff
        )
        if similar >= required_matches:
            matched_ids.append(id_num)
    return matched_ids
def filter_common_titles(merged_blocks: List[FullyMergedBlock]) -> List[FullyMergedBlock]:
    """Drop title and section-header blocks whose text recurs in the document.

    The text of each ``"Title"`` or ``"Section-header"`` block is normalized
    and paired with its index. ``find_overlap_elements`` then picks the
    indices whose text closely matches several others, and those blocks are
    removed.

    Args:
        merged_blocks: Blocks exposing ``block_type`` and ``text``.

    Returns:
        A new list holding the remaining blocks in their original order.
    """
    titles = []
    for i, block in enumerate(merged_blocks):
        if block.block_type not in ("Title", "Section-header"):
            continue
        text = block.text
        if text.strip().startswith("#"):
            # Drop the '#' runs so only the title text is compared.
            text = re.sub(r'#+', '', text)
        text = text.strip()
        # Remove page numbers from start/end
        text = replace_leading_trailing_digits(text, "").strip()
        titles.append((text, i))

    # A set makes the per-block membership test constant-time instead of a
    # list scan for every block.
    bad_block_ids = set(find_overlap_elements(titles))

    return [block for i, block in enumerate(merged_blocks) if i not in bad_block_ids]
|