File size: 6,521 Bytes
240e0a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
"""
去掉正文的引文引用marker
https://aicarrier.feishu.cn/wiki/YLOPwo1PGiwFRdkwmyhcZmr0n3d
"""
import re
# from magic_pdf.libs.nlp_utils import NLPModels
# __NLP_MODEL = NLPModels()
def check_1(spans, cur_span_i):
    """Classify a span as a citation marker from the character preceding it.

    The candidate is treated as a citation superscript when the last
    character of the span immediately before it is a full stop or a comma
    (ASCII or CJK form).

    Args:
        spans: Spans of one line. Each span is a dict whose ``'chars'`` entry
            is a list of dicts carrying the character under the key ``'c'``.
        cur_span_i: Index of the candidate span within ``spans``.

    Returns:
        True if the preceding span ends with one of the punctuation marks,
        otherwise False. The first span of a line has no predecessor and
        always yields False.
    """
    if cur_span_i == 0:
        return False  # nothing precedes the first span, so no evidence

    pre_chars = spans[cur_span_i - 1]['chars']
    if not pre_chars:
        # An empty predecessor carries no punctuation; indexing [-1] here
        # used to raise IndexError.
        return False

    # '\uff0c' is the full-width (CJK) comma, the counterpart of '。'.
    return pre_chars[-1]['c'] in ('。', '\uff0c', '.', ',')
# def check_2(spans, cur_span_i):
# """检查前面一个span的最后一个单词,如果长度大于5,全都是字母,并且不含大写,就是角标"""
# pattern = r'\b[A-Z]\.\s[A-Z][a-z]*\b' # 形如A. Bcde, L. Bcde, 人名的缩写
#
# if cur_span_i==0 and len(spans)>1:
# next_span = spans[cur_span_i+1]
# next_txt = "".join([c['c'] for c in next_span['chars']])
# result = __NLP_MODEL.detect_entity_catgr_using_nlp(next_txt)
# if result in ["PERSON", "GPE", "ORG"]:
# return True
#
# if re.findall(pattern, next_txt):
# return True
#
# return False # 不是角标
# elif cur_span_i==0 and len(spans)==1: # 角标占用了整行?谨慎删除
# return False
#
# # 如果这个span是最后一个span,
# if cur_span_i==len(spans)-1:
# pre_span = spans[cur_span_i-1]
# pre_txt = "".join([c['c'] for c in pre_span['chars']])
# pre_word = pre_txt.split(' ')[-1]
# result = __NLP_MODEL.detect_entity_catgr_using_nlp(pre_txt)
# if result in ["PERSON", "GPE", "ORG"]:
# return True
#
# if re.findall(pattern, pre_txt):
# return True
#
# return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
# else: # 既不是第一个span,也不是最后一个span,那么此时检查一下这个角标距离前后哪个单词更近就属于谁的角标
# pre_span = spans[cur_span_i-1]
# next_span = spans[cur_span_i+1]
# cur_span = spans[cur_span_i]
# # 找到前一个和后一个span里的距离最近的单词
# pre_distance = 10000 # 一个很大的数
# next_distance = 10000 # 一个很大的数
# for c in pre_span['chars'][::-1]:
# if c['c'].isalpha():
# pre_distance = cur_span['bbox'][0] - c['bbox'][2]
# break
# for c in next_span['chars']:
# if c['c'].isalpha():
# next_distance = c['bbox'][0] - cur_span['bbox'][2]
# break
#
# if pre_distance<next_distance:
# belong_to_span = pre_span
# else:
# belong_to_span = next_span
#
# txt = "".join([c['c'] for c in belong_to_span['chars']])
# pre_word = txt.split(' ')[-1]
# result = __NLP_MODEL.detect_entity_catgr_using_nlp(txt)
# if result in ["PERSON", "GPE", "ORG"]:
# return True
#
# if re.findall(pattern, txt):
# return True
#
# return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
def check_3(spans, cur_span_i):
    """Classify a span as a citation marker from its own text.

    The span text (characters joined, surrounding whitespace stripped) is
    accepted when either of the following holds:

    * it contains at least one digit together with one of ``[``, ``]``,
      ``*`` or ``,`` -- for example ``[2-3]``, ``[22]`` or ``2,3,4``;
    * it begins with a short range or pair such as ``2-3``, ``a-b`` or
      ``a,b``.

    Args:
        spans: Spans of one line. Each span is a dict whose ``'chars'`` entry
            is a list of dicts carrying the character under the key ``'c'``.
        cur_span_i: Index of the candidate span within ``spans``.

    Returns:
        True if the text matches one of the shapes above, otherwise False.
    """
    cur_span_txt = ''.join(c['c'] for c in spans[cur_span_i]['chars']).strip()

    # e.g. "[2-3]", "[22]", "2,3,4", "3*": a digit plus a marker character.
    has_marker_char = any(ch in '[]*,' for ch in cur_span_txt)
    if has_marker_char and any(ch.isdigit() for ch in cur_span_txt):
        return True

    # e.g. "2-3", "a-b", "a,b". re.match anchors only at the start, so any
    # text following the matched prefix is tolerated.
    range_or_pair = r'\d+-\d+|[a-zA-Z]-[a-zA-Z]|[a-zA-Z],[a-zA-Z]'
    return re.match(range_or_pair, cur_span_txt) is not None
def _is_raised_small_span(span, base_mid_y, min_font_sz, max_font_sz):
    """Return True if *span* is smaller than body text and vertically raised.

    "Raised" is measured as ``base_mid_y`` minus the span's own vertical
    centre being positive, i.e. the span centre has the smaller y value
    (NOTE(review): presumably y grows downward, as in PDF extractors --
    confirm against the producer of ``bbox``).
    """
    font_sz = span['size']
    # Filter body text by font size first: anything within 1 unit of the
    # largest font in the line is not a superscript candidate.
    if max_font_sz - font_sz < 1:
        return False

    top, bottom = span['bbox'][1], span['bbox'][3]
    height = bottom - top
    offset = base_mid_y - (top + bottom) / 2

    # Centre shifted by more than 20% of the span's own height. A span with
    # no height cannot be judged this way (it used to divide by zero).
    if height > 0 and offset / height > 0.2:
        return True

    # Otherwise: shifted at all, and font size within 10% of the smallest
    # font on the line. A zero minimum size makes the ratio undefined.
    return (
        offset > 0
        and min_font_sz > 0
        and abs(font_sz - min_font_sz) / min_font_sz < 0.1
    )


def remove_citation_marker(with_char_text_blcoks):
    """Remove superscript citation markers from the lines of text blocks.

    Lines with fewer than two spans are skipped, because a marker cannot
    occupy a line on its own. In every other line a span is removed when
    both of the following hold:

    1. It is a small, raised span: its font size is at least 1 below the
       largest in the line, and its vertical centre is offset from that of
       the line's tallest span (see ``_is_raised_small_span``).
    2. ``check_1`` (preceded by a full stop or comma) or ``check_3`` (text
       made of digits with brackets, asterisks or commas, or a short range)
       accepts it.

    When spans are removed, the line's ``'spans'`` list is updated in place
    and ``line['text']`` is rebuilt from the remaining characters.

    Args:
        with_char_text_blcoks: Iterable of block dicts. Each block has
            ``'lines'``; each line has ``'spans'``; each span has ``'bbox'``
            (indices 1 and 3 are read as the vertical extent), ``'size'``
            and ``'chars'`` (dicts carrying the character under ``'c'``).

    Returns:
        The same ``with_char_text_blcoks`` object, modified in place.
    """
    for blk in with_char_text_blcoks:
        for line in blk['lines']:
            spans = line['spans']
            if len(spans) <= 1:
                continue

            # The tallest span is the positional reference; max() keeps the
            # first one on ties.
            tallest_bbox = max(
                (s['bbox'] for s in spans), key=lambda b: b[3] - b[1]
            )
            base_mid_y = (tallest_bbox[3] + tallest_bbox[1]) / 2

            sizes = [s['size'] for s in spans]
            min_font_sz = min(sizes)  # smallest font in the line
            max_font_sz = max(sizes)  # largest font in the line

            # Collect indices rather than span objects: list.remove() compares
            # by equality and would delete the first *equal* dict, which is
            # the wrong one when a line contains duplicated spans.
            drop = {
                i
                for i, span in enumerate(spans)
                if _is_raised_small_span(span, base_mid_y, min_font_sz, max_font_sz)
                and (check_1(spans, i) or check_3(spans, i))
            }
            if not drop:
                continue

            # Slice assignment keeps the original list object, so other
            # references to line['spans'] see the removal as before.
            spans[:] = [s for i, s in enumerate(spans) if i not in drop]
            line['text'] = ''.join(c['c'] for s in spans for c in s['chars'])

    return with_char_text_blcoks
|