pmkhanh7890's picture
1st version of demo
1ce1659
raw
history blame
1.87 kB
from transformers import pipeline
# Shared NER pipeline, built once when the module is imported and reused by
# extract_entities(). No model is named here, so the choice is left to the
# library's default for the "ner" task (NOTE(review): pin a model if
# reproducible output matters).
ner_pipeline = pipeline("ner")
def extract_entities(text):
    """
    Extracts the distinct entity words that the NER pipeline finds in a text.

    The text is passed to the module-level ``ner_pipeline``; the "word" of each
    prediction is collected, subword and hyphenated pieces are merged, and
    repeated words are removed.

    Args:
        text: The text to run named-entity recognition on.
    Returns:
        A list of unique entity words, in order of first appearance.
    """
    output = ner_pipeline(text)
    words = extract_words(output)
    words = combine_subwords(words)
    # De-duplicate while keeping first-seen order: dict keys are unique and
    # insertion-ordered, which avoids a quadratic "not in list" scan.
    return list(dict.fromkeys(words))
def extract_words(entities):
    """
    Collects the "word" value of every entity.

    Args:
        entities: An iterable of mappings, each providing a "word" key.
    Returns:
        A list holding each entity's "word" value, in input order.
    """
    return [item["word"] for item in entities]
def combine_subwords(word_list):
    """
    Combines subword pieces and hyphenated parts into whole words.

    A token prefixed with "##" is glued onto the word before it. A "-" token
    that has a word on both sides joins those two words, so chains such as
    "a", "-", "b", "-", "c" collapse into "a-b-c".

    Args:
        word_list: A list of string tokens, where subwords are prefixed
            with "##".
    Returns:
        A new list with subwords and hyphenated parts merged; the input list
        is not modified.
    """
    result = []
    total = len(word_list)
    i = 0
    while i < total:
        word = word_list[i]
        if word.startswith("##"):
            piece = word[2:]  # Drop the "##" marker
            if result:
                result[-1] += piece
            else:
                # A subword with no preceding word (its head token was not
                # part of the input) starts a new word instead of raising
                # IndexError on the empty result.
                result.append(piece)
        elif word == "-" and result and i + 1 < total:
            # Handling the hyphen from the hyphen's own position (instead of
            # looking ahead from the word before it) also covers a left part
            # that was built from subwords and chains of several hyphens.
            result[-1] += word + word_list[i + 1]
            i += 1  # The token after the hyphen has been consumed
        else:
            result.append(word)
        i += 1
    return result
if __name__ == "__main__":
    # Adjacent string literals are concatenated verbatim, so every fragment
    # ends with an explicit space; otherwise words at the line breaks fuse
    # together (e.g. "outto", "ongoinginvestigation") and distort the input.
    text = (
        "The Saudi authorities, I am told, are currently working flat out "
        "to collate everything they have on the Magdeburg market suspect, "
        "Taleb al-Abdulmohsen, and to share it with Germany's ongoing "
        "investigation"
    )
    print(extract_entities(text))