from __future__ import annotations

import re

import evaluate
import pandas as pd

print(f"loading: {__file__}")

# Matches runs of five or more whitespace/non-word characters.
pattern_non_word_char_repetition = re.compile(r"[\s\W]{5,}")

# Matches a chunk of at least five characters that is repeated one or more
# times, with optional whitespace/non-word separators between occurrences.
pattern_text_repetitions = re.compile(
    r"(?P<repeat>.{5}.*?)(?:[\s\W]*(?P=repeat))+", re.M | re.DOTALL | re.IGNORECASE
)
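# Illustrative behavior of the two patterns (examples added for clarity, not
# part of the original module):
#   pattern_non_word_char_repetition.sub("\t", "foo----- bar") -> "foo\tbar"
#   pattern_text_repetitions on "the cat sat. the cat sat." lazily grows the
#   named group `repeat` from five characters until a repetition is found;
#   it captures "the cat sat" and the match spans the first 24 characters.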


def del_non_word_char_repetition(text, debug=False):
    """Collapse runs of 5+ whitespace/non-word characters into a single tab.

    Returns the cleaned text and the number of characters removed.
    """
    count = 0

    if isinstance(text, str):
        if debug:
            print("----detect non-word characters repetition----")
        count = len(text)
        text = pattern_non_word_char_repetition.sub("\t", text)
        count -= len(text)
        if debug and count:
            print(f"removed non-word characters repetition: {count}")
    return text, count
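# Illustrative usage (example added for clarity, not part of the original):
#   del_non_word_char_repetition("foo----- bar") -> ("foo\tbar", 5)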


def detect_text_repetitions(text, debug=False):
    """Count the characters taken up by repeated chunks of text.

    For each match, everything after the first occurrence of the repeated
    chunk (the repetitions plus their separators) is counted.
    """
    count = 0

    if isinstance(text, str):
        if debug:
            print("----detect text repetitions----")
        for match in pattern_text_repetitions.finditer(text):
            if debug:
                print(match)
                for group_num in range(1, len(match.groups()) + 1):
                    print(
                        f"Group {group_num} found at "
                        f"{match.start(group_num)}-{match.end(group_num)}: "
                        f"`{match.group(group_num)}`"
                    )

            # Group 1 is the named `repeat` group: subtract its first
            # occurrence from the full match span to count only repetitions.
            start, end = match.span()
            count += end - start - len(match.group(1))

    return count
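# Illustrative usage (example added for clarity, not part of the original):
#   detect_text_repetitions("the cat sat. the cat sat.") -> 13
#   (the match spans 24 characters; subtracting the 11-character `repeat`
#   chunk "the cat sat" leaves 13 repeated/separator characters)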


def detect_repetitions(text, debug=False):
    """Run both repetition detectors and return the tuple
    (non_word_char_repetitions, text_repetitions, total_repetitions)."""
    if not isinstance(text, str):
        return 0, 0, 0
    text, count_non_word_char_repetition = del_non_word_char_repetition(
        text, debug=debug
    )
    count_text_repetitions = detect_text_repetitions(text, debug=debug)
    total_repetitions = count_non_word_char_repetition + count_text_repetitions

    result = (
        count_non_word_char_repetition,
        count_text_repetitions,
        total_repetitions,
    )

    if debug:
        print(result)
    return result
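# Illustrative usage (examples added for clarity, not part of the original):
#   detect_repetitions("foo----- bar")              -> (5, 0, 5)
#   detect_repetitions("the cat sat. the cat sat.") -> (0, 13, 13)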


def calc_perf_scores(predictions, references, debug=False):
    """Compute BLEU-1, ROUGE, and BERTScore for predictions vs. references."""
    bleu = evaluate.load("bleu")
    rouge = evaluate.load("rouge")
    bert_score = evaluate.load("bertscore")

    if debug:
        print("predictions:", predictions)
        print("references:", references)

    # max_order=1 restricts BLEU to unigram precision (BLEU-1).
    bleu_scores = bleu.compute(
        predictions=predictions, references=references, max_order=1
    )
    rouge_scores = rouge.compute(predictions=predictions, references=references)
    bert_scores = bert_score.compute(
        predictions=predictions,
        references=references,
        lang="en",
        model_type="microsoft/deberta-large-mnli",
    )

    result = {
        "bleu_scores": bleu_scores,
        "rouge_scores": rouge_scores,
        "bert_scores": bert_scores,
    }

    if debug:
        print("result:", result)

    return result
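

if __name__ == "__main__":
    # Minimal smoke test added for illustration (not part of the original
    # module). detect_repetitions is pure Python; calc_perf_scores also needs
    # the `rouge_score` and `bert_score` packages and downloads metric models
    # on first use.
    print(detect_repetitions("the cat sat. the cat sat.", debug=True))
    print(
        calc_perf_scores(
            predictions=["the cat sat on the mat"],
            references=["the cat sat on the mat"],
        )
    )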