Spaces:

pmkhanh7890
/

news_verification

Sleeping

news_verification / src /texts /Search_Text /fake_text_generation_share.py

1st

22e1b62 4 months ago

1.49 kB

	from difflib import SequenceMatcher



	def highlight_overlap_by_word_to_list(text1: str, text2: str):
	    """
	    Locate the words that two texts have in common, by position.

	    Both texts are split on whitespace and the two word lists are aligned
	    with difflib.SequenceMatcher. Every word that falls inside a matching
	    block is reported by its index in its own word list.

	    Returns a 4-tuple:
	    - list of words in text1
	    - list of words in text2
	    - list of indices of the highlighted (matching) words in text1
	    - list of indices of the highlighted (matching) words in text2
	    """
	    # Split on runs of whitespace; an empty string yields an empty list.
	    words1 = text1.split()
	    words2 = text2.split()

	    index1 = []
	    index2 = []

	    # Compare word lists (not characters) so matches are whole words.
	    matcher = SequenceMatcher(None, words1, words2)

	    for start1, start2, length in matcher.get_matching_blocks():
	        # The final block is a zero-length sentinel; its ranges are empty,
	        # so no explicit length check is needed.
	        index1.extend(range(start1, start1 + length))
	        index2.extend(range(start2, start2 + length))

	    return words1, words2, index1, index2


	if __name__ == "__main__":
	    # Running this file directly does nothing; it only provides the
	    # helper above for other modules to import.
	    pass