import re
import string
from collections import Counter
from difflib import SequenceMatcher

from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer

def clean_text(text):
    """Lowercase a document, strip most punctuation, collapse
    whitespace, and keep only the first 18 words."""
    # Exclude "," and "." so numbers keep their separators
    punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = "".join([c for c in text if c not in punctuations])
    # Collapse whitespace and newlines
    text = re.sub(r"\s+", " ", text)
    # Replace the pound sign with a placeholder (the result must be
    # reassigned; str.replace does not modify the string in place)
    text = text.replace("£", " * ")
    words = text.split()
    text = " ".join(words[:18])  # Join the first 18 words back into a string
    return text
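
# Illustrative doctest-style example (input is assumed, not from the
# original module): commas and periods survive cleaning so numeric
# values stay readable, and "£" becomes the "*" placeholder.
# >>> clean_text("The Price, rose to £5.10 today!")
# 'the price, rose to * 5.10 today'
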
def remove_punctuation(text):
    """Remove punctuation from a given text, keeping periods."""
    punctuation_without_dot = string.punctuation.replace(".", "")
    translator = str.maketrans("", "", punctuation_without_dot)
    return text.translate(translator)
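
# Illustrative example (input is assumed): every punctuation mark except
# the period is stripped.
# >>> remove_punctuation("Hello, world: the price is 5.10!")
# 'Hello world the price is 5.10'
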
def get_keywords(text, num_keywords=5):
    """Return the top `num_keywords` keywords from a doc using the
    TF-IDF method."""
    # Create a TF-IDF Vectorizer. Note: with a single document the IDF
    # term is constant, so the ranking reduces to term frequency.
    vectorizer = TfidfVectorizer(stop_words="english")
    # Fit and transform the text
    tfidf_matrix = vectorizer.fit_transform([text])
# Get feature names (words)
feature_names = vectorizer.get_feature_names_out()
# Get TF-IDF scores
tfidf_scores = tfidf_matrix.toarray()[0]
# Sort words by TF-IDF score
word_scores = list(zip(feature_names, tfidf_scores))
word_scores.sort(key=lambda x: x[1], reverse=True)
# Return top keywords
return [word for word, score in word_scores[:num_keywords]]
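
# Illustrative example (input is assumed): English stop words such as
# "the" and "on" are discarded; the repeated token ranks first, and
# ties keep alphabetical order.
# >>> get_keywords("the cat sat on the mat with another cat", num_keywords=2)
# ['cat', 'mat']
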
def get_important_sentences(
paragraph: str,
keywords: list[str],
num_sentences: int = 3,
) -> list[str]:
"""
Selects important sentences based on a list of keywords.
Args:
paragraph (str): The input paragraph.
keywords (list[str]): List of important keywords.
num_sentences (int): Number of sentences to return (default is 3).
Returns:
list: A list of important sentences.
"""
# Clean and split the paragraph into sentences
sentences = [
s.strip() for s in re.split(r"(?<=[.!?])\s+", paragraph) if s.strip()
]
    # Score each sentence by how often the keywords occur in it.
    # Note: clean_text keeps only the first 18 words, so keyword hits
    # later in a very long sentence are not counted.
    sentence_scores = []
    for sentence in sentences:
        processed_sentence = clean_text(sentence)
        words = processed_sentence.split()  # clean_text already lowercases
        word_count = Counter(words)
        score = sum(word_count[keyword.lower()] for keyword in keywords)
        sentence_scores.append((sentence, score))
# Sort sentences by their scores in descending order
sentence_scores.sort(key=lambda x: x[1], reverse=True)
# Return the top N sentences
return [sentence for sentence, score in sentence_scores[:num_sentences]]
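
# Illustrative example (inputs are assumed): matching is on exact
# cleaned tokens, so "Cats" does not count toward the keyword "cat".
# >>> para = "Cats sleep a lot. Dogs bark loudly. My cat naps all day."
# >>> get_important_sentences(para, ["cat"], num_sentences=1)
# ['My cat naps all day.']
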
def extract_important_phrases(
    paragraph: str,
    keywords: list[str],
    phrase_length: int | None = None,
) -> list[str]:
    """
    Extracts important phrases based on a list of keywords.
    Phrase length is auto-determined when not given (5 to 7 words), and
    selected phrases overlap by at most 20%.
    Args:
        paragraph (str): The input paragraph.
        keywords (list[str]): List of important keywords.
        phrase_length (int | None): Length of phrases to extract;
            auto-determined when None.
    Returns:
        list: A list of important phrases.
    """
    # Tokenize the paragraph into words
    words = word_tokenize(paragraph.lower())
    # Auto-determine phrase length (clamped between 5 and 7 words)
    if phrase_length is None:
        phrase_length = min(max(len(words) // 10, 5), 7)
# Generate n-grams (phrases) from the paragraph
phrases = list(ngrams(words, phrase_length))
important_phrases = []
used_indices = set()
for i, phrase in enumerate(phrases):
# Check if the phrase contains any keyword
if any(keyword.lower() in phrase for keyword in keywords):
# Check overlap with previously selected phrases
if not any(abs(i - j) < phrase_length * 0.8 for j in used_indices):
important_phrases.append(clean_text(" ".join(phrase)))
used_indices.add(i)
return important_phrases
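
# Illustrative example (input is assumed; requires NLTK's "punkt"
# tokenizer data): with ~15 tokens the auto length is 5, and phrases
# overlapping the first match are suppressed.
# >>> extract_important_phrases(
# ...     "The quick brown fox jumps over the lazy dog near the river "
# ...     "bank every morning",
# ...     ["fox"],
# ... )
# ['the quick brown fox jumps']
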
def extract_equal_text(text1, text2):
    """Return word-index ranges of spans that match in both texts,
    ignoring case and punctuation."""

    def cleanup(text):
        text = text.lower()
        text = text.translate(str.maketrans("", "", string.punctuation))
        return text

    split_text1 = cleanup(text1).split()
    split_text2 = cleanup(text2).split()
    s = SequenceMatcher(None, split_text1, split_text2)
    equal_idx_1 = []
    equal_idx_2 = []
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        if tag == "equal":
            equal_idx_1.append({"start": i1, "end": i2})
            equal_idx_2.append({"start": j1, "end": j2})
    return equal_idx_1, equal_idx_2
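
# Illustrative example (inputs are assumed): ranges are half-open word
# indexes, in the style of difflib opcodes.
# >>> extract_equal_text("The cat sat down.", "A cat sat here.")
# ([{'start': 1, 'end': 3}], [{'start': 1, 'end': 3}])
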
def connect_consecutive_indexes(nums):
    """
    Connects consecutive integers in a sorted list.
    Args:
        nums: A list of integers, assumed sorted in ascending order.
    Returns:
        A list of lists, where each inner list represents a consecutive range.
    """
    if not nums:  # Handle empty input
        return []
    result = []
    start = nums[0]
    end = nums[0]
for i in range(1, len(nums)):
if nums[i] == end + 1:
end = nums[i]
else:
result.append([start, end])
start = nums[i]
end = nums[i]
result.append([start, end]) # Add the last range
return result
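
# Illustrative example (input is assumed): runs of consecutive integers
# collapse into [start, end] pairs; singletons repeat the same value.
# >>> connect_consecutive_indexes([1, 2, 3, 7, 8, 10])
# [[1, 3], [7, 8], [10, 10]]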