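# NOTE(review): "Spaces: Sleeping / Sleeping" appeared here as page-header
# residue captured together with this file; it is not part of the program.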
from difflib import SequenceMatcher
import string
def extract_equal_text(text1: str, text2: str):
    """Find the word runs two texts share, ignoring case and punctuation.

    Both texts are split on whitespace.  Every word is lower-cased and has
    the characters in ``string.punctuation`` removed; words that become
    empty are left out of the comparison.  The remaining word sequences are
    aligned with ``difflib.SequenceMatcher`` and each ``'equal'`` opcode is
    reported.  One diagnostic line is printed per matching run.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A pair ``(spans_1, spans_2)`` of equally long lists.  Entry ``k`` of
        each list is a dict ``{"start": int, "end": int}`` describing a
        half-open slice into ``text1.split()`` / ``text2.split()`` for the
        k-th matching run.  A span may enclose punctuation-only words that
        sit between two matched words.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def normalize(words):
        """Return the cleaned non-empty words and their indices in ``words``."""
        kept = []
        positions = []
        for pos, word in enumerate(words):
            cleaned = word.lower().translate(strip_punctuation)
            if cleaned:
                kept.append(cleaned)
                positions.append(pos)
        return kept, positions

    words1 = text1.split()
    words2 = text2.split()
    cleaned1, positions1 = normalize(words1)
    cleaned2, positions2 = normalize(words2)

    matcher = SequenceMatcher(None, cleaned1, cleaned2)

    equal_idx_1 = []
    equal_idx_2 = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag != 'equal':
            continue
        # Opcode indices refer to the cleaned sequences.  Words made only of
        # punctuation are absent there, so translate back to positions in
        # the raw word lists before slicing or reporting.
        a1, a2 = positions1[i1], positions1[i2 - 1] + 1
        b1, b2 = positions2[j1], positions2[j2 - 1] + 1
        equal_idx_1.append({"start": a1, "end": a2})
        equal_idx_2.append({"start": b1, "end": b2})
        subtext_1 = " ".join(words1[a1:a2])
        subtext_2 = " ".join(words2[b1:b2])
        print(f'{tag:7} a[{a1:2}:{a2:2}] --> b[{b1:2}:{b2:2}] {subtext_1!r:>55} --> {subtext_2!r}')
    return equal_idx_1, equal_idx_2
# Sample inputs: two differently worded reports of the same story, used to
# exercise extract_equal_text.
text1 = """
Miguel Almiron has permanently rejoined Atlanta United from Newcastle United for £8m.
Almiron made 223 appearances for Newcastle, scoring 30 goals, but recently struggled for a starting place under Eddie Howe.
He made a substitute appearance and waved farewell to fans in Newcastle's recent win against Southampton.
Almiron played a key role in Newcastle reaching the Carabao Cup final and their Premier League top-four finish in 2022-23, and scored against Paris St-Germain in the Champions League.
"""
text2 = """
Newcastle United winger Miguel Almiron has rejoined Atlanta United on a permanent deal for £8m.
Almiron has made 223 appearances for Newcastle, scoring 30 goals, but has struggled recently to gain a place in manager Eddie Howe's starting line-up.
Last weekend he came on as a substitute in Newcastle's 3-1 win against Southampton and waved farewell to the travelling supporters.
Almiron played a significant role in Newcastle reaching the Carabao Cup final and finishing fourth in the Premier League in 2022-23.
"""

# The call also prints diagnostic lines as a side effect.
idx_1, idx_2 = extract_equal_text(text1, text2)

# Disabled example: show the words of text1 covered by each returned span.
# text1_split = text1.split()
# for idx in idx_1:
#     print(text1_split[idx["start"]:idx["end"]])