File size: 2,766 Bytes
c8a32e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import re
from collections import Counter
from rapidfuzz import fuzz

from marker.schema.merged import FullyMergedBlock
from typing import List, Tuple


def filter_common_elements(lines, page_count, threshold=.6):
    """Return the ids of spans whose text repeats across the given lines.

    Args:
        lines: Iterable of line objects; each must expose ``spans``, and every
            span must expose ``text`` and ``span_id``. It is traversed twice,
            so it should be a re-iterable sequence.
        page_count: Number of pages the lines were collected from.
        threshold: A text counts as common when it occurs strictly more than
            ``page_count * threshold`` times.

    Returns:
        A list of ``span_id`` values, in line/span order, for every span whose
        text is common. Empty when ``page_count`` is below 3.
    """
    # We can't filter if we don't have enough pages to find common elements
    if page_count < 3:
        return []

    # Spans of four characters or fewer are never counted, so they can never
    # be flagged (presumably to avoid matching very short, generic tokens).
    occurrences = Counter(
        span.text for line in lines for span in line.spans if len(span.text) > 4
    )
    min_occurrences = page_count * threshold
    # A set keeps the membership test below O(1) per span instead of a linear
    # scan over every common string.
    common = {text for text, count in occurrences.items() if count > min_occurrences}
    return [span.span_id for line in lines for span in line.spans if span.text in common]


def filter_header_footer(all_page_blocks, max_selected_lines=2):
    """Collect ids of spans that look like repeated page headers or footers.

    For every page, the first and the last ``max_selected_lines`` non-blank
    lines (as returned by the page's ``get_nonblank_lines()``) are gathered
    into a header pool and a footer pool. Each pool is then passed to
    ``filter_common_elements`` together with the number of pages.

    Args:
        all_page_blocks: Sequence of page objects exposing
            ``get_nonblank_lines()``.
        max_selected_lines: How many lines to take from each end of a page.

    Returns:
        Span ids flagged in the header pool followed by those flagged in the
        footer pool.
    """
    page_total = len(all_page_blocks)
    header_candidates, footer_candidates = [], []

    for page_blocks in all_page_blocks:
        page_lines = page_blocks.get_nonblank_lines()
        header_candidates += page_lines[:max_selected_lines]
        footer_candidates += page_lines[-max_selected_lines:]

    header_ids = filter_common_elements(header_candidates, page_total)
    footer_ids = filter_common_elements(footer_candidates, page_total)
    return header_ids + footer_ids


def replace_leading_trailing_digits(string, replacement):
    """Substitute ``replacement`` for a digit run at the start and end of ``string``.

    The leading run is replaced first; the trailing run is then searched for
    in the result of that first substitution. Digits in the middle of the
    string are left untouched.
    """
    without_leading = re.sub(r'^\d+', replacement, string)
    return re.sub(r'\d+$', replacement, without_leading)


def find_overlap_elements(lst: List[Tuple[str, int]], string_match_thresh=.9, min_overlap=.05) -> List[int]:
    """Return the ids of entries whose text closely matches several other entries.

    Args:
        lst: ``(text, id)`` pairs to compare against each other.
        string_match_thresh: Fraction (0-1) that is scaled by 100 and compared
            with the ``fuzz.ratio`` score of two texts; a pair matches when
            the score is at least that value.
        min_overlap: Fraction of ``len(lst)`` that must match an entry before
            it is reported. The required count is never lower than 3.

    Returns:
        The ids, in input order, of every entry that matches at least
        ``max(3.0, len(lst) * min_overlap)`` other entries.
    """
    min_score = string_match_thresh * 100
    required_matches = max(3.0, len(lst) * min_overlap)
    texts = [entry[0] for entry in lst]

    flagged = []
    for idx, (candidate, id_num) in enumerate(lst):
        # Count every *other* entry that is similar enough to this one; the
        # index check comes first so an entry is never scored against itself.
        match_count = sum(
            1
            for other_idx, other in enumerate(texts)
            if other_idx != idx and fuzz.ratio(candidate, other) >= min_score
        )
        if match_count >= required_matches:
            flagged.append(id_num)

    return flagged


def filter_common_titles(merged_blocks: List[FullyMergedBlock]) -> List[FullyMergedBlock]:
    """Drop title/section-header blocks whose text repeats throughout the document.

    Every block whose ``block_type`` is ``"Title"`` or ``"Section-header"`` is
    normalised (``#`` runs removed when the text starts with ``#``, surrounding
    whitespace stripped, leading/trailing digit runs removed) and handed to
    ``find_overlap_elements`` together with its index. Blocks whose index is
    reported back are removed.

    Args:
        merged_blocks: The document's blocks, in order.

    Returns:
        A new list with the flagged blocks left out; the remaining blocks keep
        their original order and are not modified.
    """
    titles = []
    for i, block in enumerate(merged_blocks):
        if block.block_type not in ("Title", "Section-header"):
            continue
        text = block.text
        if text.strip().startswith("#"):
            # Note: this removes every run of '#' in the text, not only the
            # leading heading marker.
            text = re.sub(r'#+', '', text)
        text = text.strip()
        # Remove page numbers from start/end
        text = replace_leading_trailing_digits(text, "").strip()
        titles.append((text, i))

    # A set gives O(1) membership checks while filtering, instead of scanning
    # the list of flagged indices once per block.
    bad_block_ids = set(find_overlap_elements(titles))

    return [block for i, block in enumerate(merged_blocks) if i not in bad_block_ids]