File size: 13,227 Bytes
240e0a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
from magic_pdf.libs.commons import fitz             # pyMuPDF库
import re

from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox             # json


## version 2
def get_merged_line(page):
    """
    Pick the horizontal rules out of a page's vector drawings and merge
    segments that belong to the same broken line.

    :param page: page object exposing ``get_drawings()``; every drawing is a
                 mapping whose ``"rect"`` value has an ``irect`` attribute
                 that unpacks to ``(L, U, R, D)``
    :return: list of ``(L, U, R, D)`` tuples, one per merged horizontal line.
             All lines produced from one height group share the ``U``/``D``
             of that group's left-most segment.
    """
    # Bounding boxes (L, U, R, D) of every vector drawing on the page.
    drawings_bbox = [p["rect"].irect for p in page.get_drawings()]

    # A drawing counts as a horizontal rule when it is at most 3 units tall.
    lines = [(L, U, R, D) for L, U, R, D in drawings_bbox if abs(D - U) <= 3]

    # Bucket the rules by height: a rule joins the group of the first
    # still-unassigned rule whose combined vertical extent with it is <= 5.
    U_groups = []
    visited = [False] * len(lines)
    for i, (L1, U1, R1, D1) in enumerate(lines):
        if visited[i]:
            continue
        visited[i] = True
        tmp_g = [(L1, U1, R1, D1)]
        for j, (L2, U2, R2, D2) in enumerate(lines):
            if visited[j]:
                continue
            if max(U1, D1, U2, D2) - min(U1, D1, U2, D2) <= 5:
                tmp_g.append((L2, U2, R2, D2))
                visited[j] = True
        U_groups.append(tmp_g)

    res = []
    for group in U_groups:
        # Sweep left to right, extending the current line while the next
        # segment starts less than 5 units past its right end.
        group.sort(key=lambda LURD: (LURD[0], LURD[2]))
        LL, UU, RR, DD = group[0]
        for L1, _, R1, _ in group[1:]:
            if (L1 - RR) >= 5:
                res.append((LL, UU, RR, DD))
                # Start a new line: BOTH horizontal ends must restart here,
                # otherwise the new line keeps the previous line's right edge.
                LL, RR = L1, R1
            else:
                RR = max(RR, R1)
        res.append((LL, UU, RR, DD))
    return res

def fix_tables(page: fitz.Page, table_bboxes: list, include_table_title: bool, scan_line_num: int):
    """
    Snap table boxes to nearby horizontal rules and optionally pull the box
    up to a caption found above the table.

    :param page: current page as read by fitz
    :param table_bboxes: list whose elements unpack to (L, U, R, D)
    :param include_table_title: whether the table caption should be enclosed as well
    :param scan_line_num: how many of the nearest text blocks above the table
                          are inspected when searching for a caption
    :return: list of ``[L, U, R, D]`` lists, each padded by 2 on every side
    """
    merged_lines = get_merged_line(page)
    adjusted = []

    for L, U, R, D in table_bboxes:
        # Only rules within 10% of the table width / height of the respective
        # table edge are considered.
        w_tol = (R - L) * 0.1
        h_tol = (D - U) * 0.1

        top_hits = []
        bottom_hits = []
        for seg in merged_lines:
            if not (L - w_tol <= seg[0] <= L + w_tol):
                continue
            if not (R - w_tol <= seg[2] <= R + w_tol):
                continue
            if U - h_tol < seg[1] < U + h_tol:      # close to the top edge
                top_hits.append(seg)
            elif D - h_tol < seg[1] < D + h_tol:    # close to the bottom edge
                bottom_hits.append(seg)

        # Tolerances above were taken from the incoming box; only now move the edges.
        edge_hits = top_hits + bottom_hits
        if top_hits:
            U = min(seg[1] for seg in top_hits)
        if bottom_hits:
            D = max(seg[1] for seg in bottom_hits)
        if edge_hits:
            L = min(seg[0] for seg in edge_hits)
            R = max(seg[2] for seg in edge_hits)

        if include_table_title:
            all_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]

            candidates = []
            for blk in all_blocks:
                x0, x1 = blk['bbox'][0], blk['bbox'][2]
                # Drop text lying entirely to one side of the table
                # (e.g. text in another column).
                if (x0 < L and x1 < L) or (x0 > R and x1 > R):
                    continue
                # Keep only blocks whose bottom is above the table top.
                if U - blk['bbox'][3] > 0:
                    candidates.append(blk)

            # Nearest block first; ties broken left to right.
            candidates.sort(key=lambda b: (U - b['bbox'][3], b['bbox'][0]))

            for _, blk in zip(range(scan_line_num), candidates):
                blk_lines = blk['lines']
                if not blk_lines:
                    continue
                first_text = blk_lines[0]['spans'][0]['text']  # text of the first span
                # Caption starts with "Table" (English) or "表" (Chinese).
                if not (re.match('Table', first_text) or re.match('表', first_text)):
                    continue
                if blk['bbox'][1] < D:  # guard against producing a negative-height box
                    U = blk['bbox'][1]

        adjusted.append([L - 2, U - 2, R + 2, D + 2])

    return adjusted

def __check_table_title_pattern(text):
    """
    Tell whether a piece of text begins like a table caption.

    :param text: text to test
    :return: True when the text starts (case-insensitively) with "table",
             one whitespace character and at least one digit; False otherwise
    """
    caption_patterns = (r'^table\s\d+',)
    return any(
        re.match(candidate, text, re.IGNORECASE) is not None
        for candidate in caption_patterns
    )
         
         
def fix_table_text_block(pymu_blocks, table_bboxes: list):
    """
    Grow each table box over the text blocks it intersects, and trim table
    captions that partially overlap a table so that they no longer do.

    Example document: tmp/unittest/unittest_pdf/纯2列_ViLT_6_文字 表格.pdf

    :param pymu_blocks: text blocks; each is a dict with a ``'bbox'`` and
                        ``'lines'`` -> ``'spans'`` -> ``'text'``. Blocks merged
                        into a table get ``'_table'`` set to True, and trimmed
                        captions get their ``'bbox'`` replaced by a list.
    :param table_bboxes: list of mutable ``[L, U, R, D]`` boxes, modified in place
    :return: the same ``table_bboxes`` list
    """
    for tb in table_bboxes:
        # Snapshot of the box before any growth; the intersection test below
        # is made against this snapshot, as tb itself grows while we iterate.
        (L, U, R, D) = tb
        for block in pymu_blocks:
            if not _is_in_or_part_overlap((L, U, R, D), block['bbox']):
                continue

            txt = " ".join(span['text'] for line in block['lines'] for span in line['spans'])
            is_title = __check_table_title_pattern(txt)

            # Captions are not merged here: a later step handles them uniformly,
            # and merging now could make that step latch onto another table's
            # caption when two tables follow each other.
            if not is_title and block.get("_table", False) is False:
                tb[0] = min(tb[0], block['bbox'][0])
                tb[1] = min(tb[1], block['bbox'][1])
                tb[2] = max(tb[2], block['bbox'][2])
                tb[3] = max(tb[3], block['bbox'][3])
                block['_table'] = True  # placeholder so no other table claims this block

            # A caption that partially overlaps the table is trimmed so the two
            # no longer overlap vertically.
            if is_title and _is_part_overlap(tb, block['bbox']):
                title_bbox = list(block['bbox'])
                table_top, table_bottom = tb[1], tb[3]
                # Trim only the side that straddles a table edge; trimming both
                # ends unconditionally would leave the caption with its top
                # below its bottom.
                if title_bbox[1] < table_top < title_bbox[3]:
                    title_bbox[3] = table_top - 1
                elif title_bbox[1] < table_bottom < title_bbox[3]:
                    title_bbox[1] = table_bottom + 1
                block['bbox'] = title_bbox

    return table_bboxes


def __get_table_caption_text(text_block):
    """
    Flatten a text block into one string and count its lines.

    :param text_block: dict with ``'lines'``, each line holding ``'spans'``
                       whose ``'text'`` values are joined with single spaces
    :return: tuple ``(text, line_count)``; every occurrence of the artifact
             "Ž . " is removed from the text
    """
    block_lines = text_block['lines']
    span_texts = [span['text'] for line in block_lines for span in line['spans']]
    cleaned = " ".join(span_texts).replace("Ž . ", '')
    return cleaned, len(block_lines)


def _merge_bbox_into(tb, bbox):
    """Expand the table box ``tb`` in place so that it also covers ``bbox``."""
    tb[0] = min(tb[0], bbox[0])
    tb[1] = min(tb[1], bbox[1])
    tb[2] = max(tb[2], bbox[2])
    tb[3] = max(tb[3], bbox[3])


def _scan_for_caption_block(pymu_blocks, tb, find_nearest, edge_idx, max_steps=3):
    """
    Walk away from the table in one vertical direction, at most ``max_steps``
    text blocks, and return the block at which the walk stopped.

    The walk stops at the first non-empty block that either looks like a table
    caption or has three or more lines; empty blocks and short non-caption
    blocks are stepped over by moving the probe box's ``edge_idx`` edge (1 for
    top, 3 for bottom) out to that block. Returns whatever ``find_nearest``
    returned last, which may be falsy when no block was found.
    """
    probe = tb.copy()  # the width stays fixed; only one vertical edge moves
    candidate = None
    for _ in range(max_steps):
        candidate = find_nearest(pymu_blocks, probe)
        if not candidate:
            break
        txt, line_cnt = __get_table_caption_text(candidate)
        if txt.strip() and (__check_table_title_pattern(txt) or line_cnt >= 3):
            break
        probe[edge_idx] = candidate['bbox'][edge_idx]
    return candidate


def _try_claim_caption(tb, text_block, flag_key, skip_table_text):
    """
    Merge ``text_block`` into ``tb`` if it is an unclaimed table caption.

    A block is claimed only when it exists, its ``flag_key`` entry is absent or
    False, its text matches the caption pattern and, if ``skip_table_text`` is
    set, it is not already flagged ``'_table'``. On success the block is
    flagged with ``flag_key`` and True is returned; otherwise nothing changes
    and False is returned.
    """
    if not text_block or text_block.get(flag_key, False) is not False:
        return False
    caption, _ = __get_table_caption_text(text_block)
    if not __check_table_title_pattern(caption):
        return False
    if skip_table_text and text_block.get("_table", False) is not False:
        return False
    _merge_bbox_into(tb, text_block['bbox'])
    text_block[flag_key] = True
    return True


def include_table_title(pymu_blocks, table_bboxes: list):
    """
    Extend every table box so that it also encloses the table's caption.

    For each table the search order is: captions on both sides (the nearer one
    wins), caption above, caption below, caption to the left, caption to the
    right. The first caption found is merged and the search for that table ends.

    :param pymu_blocks: text blocks (dicts with ``'bbox'`` and ``'lines'``);
                        claimed blocks are flagged in place
    :param table_bboxes: list of mutable ``[L, U, R, D]`` boxes, modified in place
    :return: the same ``table_bboxes`` list
    """
    for tb in table_bboxes:
        # Look up and down, at most 3 blocks each way.
        text_block_top = _scan_for_caption_block(
            pymu_blocks, tb, find_top_nearest_text_bbox, 1)
        text_block_bottom = _scan_for_caption_block(
            pymu_blocks, tb, find_bottom_nearest_text_bbox, 3)

        if (text_block_top and text_block_bottom
                and text_block_top.get("_table_caption", False) is False
                and text_block_bottom.get("_table_caption", False) is False):
            btn_text, _ = __get_table_caption_text(text_block_bottom)
            top_text, _ = __get_table_caption_text(text_block_top)
            if __check_table_title_pattern(btn_text) and __check_table_title_pattern(top_text):
                # A caption both above and below: take the nearer one.
                btn_text_distance = text_block_bottom['bbox'][1] - tb[3]
                top_text_distance = tb[1] - text_block_top['bbox'][3]
                text_block = text_block_bottom if btn_text_distance < top_text_distance else text_block_top
                _merge_bbox_into(tb, text_block['bbox'])
                # Flag the block that was actually merged, so the other caption
                # stays available to the neighbouring table.
                text_block['_table_caption'] = True
                continue

        # Otherwise try the block above, then the block below.
        if _try_claim_caption(tb, text_block_top, "_table_caption", True):
            continue
        if _try_claim_caption(tb, text_block_bottom, "_table_caption", True):
            continue

        # Finally look left, then right; only one block is examined per side.
        # NOTE(review): these two branches track claims under "_image_caption"
        # rather than "_table_caption" — confirm this key is intended.
        left_text_block = find_left_nearest_text_bbox(pymu_blocks, tb)
        if _try_claim_caption(tb, left_text_block, "_image_caption", False):
            continue

        right_text_block = find_right_nearest_text_bbox(pymu_blocks, tb)
        _try_claim_caption(tb, right_text_block, "_image_caption", False)

    return table_bboxes