# Hugging Face Spaces page header (Space status: Sleeping) -- not part of the program.
from transformers import pipeline

# Module-level token-classification ("ner") pipeline, built once at import
# time and reused by extract_entities(). No model name is passed, so the
# library picks the checkpoint (presumably its default NER model -- confirm
# against the installed transformers version).
ner_pipeline = pipeline("ner")
def extract_entities(text):
    """
    Runs the NER pipeline on a text and returns the distinct entity words.

    The raw pipeline output is reduced to its "word" strings, word pieces
    and hyphenated parts are merged by combine_subwords(), and duplicates
    are dropped.

    Args:
        text: The input text passed to the NER pipeline.

    Returns:
        A list of unique entity words, in order of first appearance.
    """
    output = ner_pipeline(text)
    words = combine_subwords(extract_words(output))
    # dict.fromkeys keeps the first occurrence of each word in insertion
    # order, giving the same result as a "not in list" scan without the
    # quadratic cost.
    return list(dict.fromkeys(words))
def extract_words(entities):
    """
    Extracts the words from a list of entities.

    Args:
        entities: A list of entities; each one must be indexable by the
            key "word".

    Returns:
        A list with the "word" value of every entity, in input order.

    Raises:
        KeyError: If an entity has no "word" key.
    """
    return [entity["word"] for entity in entities]
def combine_subwords(word_list):
    """
    Combines subwords (indicated by "##") and hyphenated parts into words.

    A token starting with "##" is glued onto the preceding word with the
    prefix removed. A standalone "-" token glues the token that follows it
    onto the preceding word, so "al", "-", "Abd" becomes "al-Abd"; this
    also works after a subword merge and for chains such as a-b-c.

    Args:
        word_list: A list of words, where subwords are prefixed with "##".

    Returns:
        A new list with subwords and hyphenated parts combined with their
        preceding words. The input list is not modified.
    """
    result = []
    total = len(word_list)
    i = 0
    while i < total:
        token = word_list[i]
        if token.startswith("##"):
            piece = token[2:]
            if result:
                result[-1] += piece
            else:
                # A leading subword has no word to attach to; keep it as
                # its own word instead of failing on result[-1].
                result.append(piece)
        elif token == "-" and result and i + 1 < total:
            following = word_list[i + 1]
            if following.startswith("##"):
                # Never leak the word-piece marker into the joined word.
                following = following[2:]
            result[-1] += "-" + following
            i += 1  # The token after the hyphen has been consumed.
        else:
            # Ordinary word, or a "-" with nothing on one side of it.
            result.append(token)
        i += 1
    return result
if __name__ == "__main__":
    # Demo run: print the entities found in a sample news sentence.
    # Adjacent string literals are concatenated verbatim, so every fragment
    # except the last ends with a space to keep the words separated.
    text = (
        "The Saudi authorities, I am told, are currently working flat out "
        "to collate everything they have on the Magdeburg market suspect, "
        "Taleb al-Abdulmohsen, and to share it with Germany's ongoing "
        "investigation"
    )
    print(extract_entities(text))