Spaces:

pmkhanh7890
/

news_verification

Sleeping

App Files Files

news_verification / src /application /text /identity.py

pmkhanh7890

1st version of demo

1ce1659 4 months ago

raw

history blame

1.87 kB

	from transformers import pipeline

	# Module-level NER pipeline, constructed once when this module is imported and
	# shared by every extract_entities() call.
	# NOTE(review): no model or aggregation strategy is passed, so this presumably
	# loads the library's default NER model and yields per-token results (hence
	# the "##" subword handling below) — confirm against the transformers docs.
	ner_pipeline = pipeline("ner")

	def extract_entities(text):
	    """
	    Extracts the unique named-entity words found in a text.

	    Runs the module-level NER pipeline on the text, pulls out the word of
	    each prediction, merges subword/hyphenated pieces, and removes
	    duplicates while keeping first-occurrence order.

	    Args:
	        text: The text to analyze.

	    Returns:
	        A list of distinct entity words, in order of first appearance.
	    """
	    output = ner_pipeline(text)
	    words = combine_subwords(extract_words(output))

	    # dict keys are unique and keep insertion order, so this de-duplicates in
	    # O(n) while preserving the order in which the words were first seen.
	    return list(dict.fromkeys(words))


	def extract_words(entities):
	    """
	    Extracts the words from a list of entities.

	    Args:
	        entities: A list of entities; each one must support lookup of the
	            "word" key (e.g. a dict with a "word" entry).

	    Returns:
	        A list of the "word" values, in the same order as the entities.

	    Raises:
	        KeyError: If a dict entity has no "word" key.
	    """
	    return [entity["word"] for entity in entities]


	def combine_subwords(word_list):
	    """
	    Combines subwords (indicated by "##") and hyphenated parts into words.

	    A token starting with "##" is appended (without the marker) to the word
	    before it. A "-" token is glued, together with the token that follows
	    it, onto the word before it, so "al", "-", "Abd", "##ul" becomes
	    "al-Abdul". The input list is not modified.

	    Args:
	        word_list: A list of words, where subwords are prefixed with "##".

	    Returns:
	        A new list with subwords and hyphenated parts merged into their
	        preceding words.
	    """
	    result = []
	    count = len(word_list)
	    i = 0
	    while i < count:
	        word = word_list[i]
	        if word.startswith("##"):
	            piece = word[2:]  # Drop the "##" marker
	            if result:
	                result[-1] += piece
	            else:
	                # No preceding word to attach to (its head piece was not in
	                # the list): start a new word instead of raising IndexError.
	                result.append(piece)
	        elif word == "-" and result and i + 1 < count:
	            # Join "<previous>-<next>". Handling the hyphen when it is
	            # reached (not by look-ahead from the previous token) also
	            # covers words that were just built from subwords and chains
	            # such as a-b-c.
	            result[-1] += word + word_list[i + 1]
	            i += 1  # The token after the hyphen has been consumed
	        else:
	            result.append(word)
	        i += 1
	    return result

	if __name__ == "__main__":
	    # Manual smoke test: run entity extraction on a sample sentence.
	    # Each fragment ends with a space because adjacent string literals are
	    # concatenated with nothing in between; without it words fuse together
	    # (e.g. "flat outto collate") and the NER input is corrupted.
	    text = (
	        "The Saudi authorities, I am told, are currently working flat out "
	        "to collate everything they have on the Magdeburg market suspect, "
	        "Taleb al-Abdulmohsen, and to share it with Germany's ongoing "
	        "investigation"
	    )
	    print(extract_entities(text))