"""
Author: Khanh Phan
Date: 2024-12-04
"""

from nltk.tokenize import sent_tokenize


# TODO: consider moving to helpers
def split_into_sentences(input_text: str) -> list[str]:
"""
Splits input text into sentences by newlines
and then tokenizes each paragraph into sentences.
Args:
input_text (str): The input text as a string.
Returns:
list: A list of sentences.
Returns an empty list if input is not a string.
"""
    if not isinstance(input_text, str):
        return []

    # Split the input text into paragraphs based on newline characters,
    # keeping the newline characters.
    paragraphs = input_text.splitlines(keepends=True)

    sentences = []
    for paragraph in paragraphs:
        # Remove leading/trailing whitespace.
        paragraph = paragraph.strip()
        if paragraph:
            # Tokenize the paragraph into sentences.
            sentences.extend(sent_tokenize(paragraph))
    return sentences
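

# Illustrative usage (a sketch, not an executable part of the module): the
# sample text below is hypothetical, and the shown output assumes NLTK's
# default English "punkt" tokenizer data has been downloaded, e.g. via
# nltk.download("punkt").
#
#   split_into_sentences("Hello world. How are you?\n\nA new paragraph.")
#   # -> ["Hello world.", "How are you?", "A new paragraph."]
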
def split_into_paragraphs(input_text: str) -> list[str]:
    """
    Splits the input text into paragraphs based on newline characters.

    Args:
        input_text (str): The input text as a string.

    Returns:
        list[str]: A list of paragraphs.
            Returns an empty list if the input is not a string.
    """
    if not isinstance(input_text, str):
        return []

    # Split the input text into paragraphs based on newline characters,
    # keeping the newline characters.
    paragraphs = input_text.splitlines(keepends=True)

    out_paragraphs = []
    for paragraph in paragraphs:
        # Remove leading/trailing whitespace.
        paragraph = paragraph.strip()
        if paragraph:
            # Append the cleaned paragraph to the output list.
            out_paragraphs.append(paragraph)
    return out_paragraphs
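

# Minimal demo sketch (the sample string is hypothetical, not from the
# original module): each non-empty line becomes one paragraph and blank
# lines are dropped.
if __name__ == "__main__":
    sample = "First paragraph.\n\nSecond paragraph.\n"
    print(split_into_paragraphs(sample))
    # Expected: ['First paragraph.', 'Second paragraph.']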