from __future__ import annotations

from transformers import pipeline

# Built once at import time so every call to extract_entities() reuses the
# same pipeline object instead of constructing a new one per call.
ner_pipeline = pipeline("ner")


def extract_entities(text: str) -> list[str]:
    """Return the distinct entity words found in *text*.

    The text is run through the module-level ``ner_pipeline``; the "word"
    field of every returned token is collected, word pieces are merged by
    ``combine_subwords``, and duplicates are dropped.

    Args:
        text: The text to analyse.

    Returns:
        The merged entity words, de-duplicated, in order of first appearance.
    """
    output = ner_pipeline(text)
    words = combine_subwords(extract_words(output))
    # dict.fromkeys keeps first-seen order while de-duplicating in linear time.
    return list(dict.fromkeys(words))


def extract_words(entities: list[dict]) -> list[str]:
    """Extract the words from a list of entities.

    Args:
        entities: A list of entities, each a mapping with a "word" key.

    Returns:
        A list with the "word" value of every entity, in input order.
    """
    return [entity["word"] for entity in entities]


def combine_subwords(word_list: list[str]) -> list[str]:
    """Combine subwords (indicated by "##") with the preceding word.

    A word that is immediately followed by a "-" token and at least one more
    token is additionally merged with both of them into one hyphenated word.

    Args:
        word_list: A list of words, where subwords are prefixed with "##".

    Returns:
        A new list with subwords combined with their preceding words. The
        input list is not modified.
    """
    result: list[str] = []
    i = 0
    while i < len(word_list):
        word = word_list[i]
        if word.startswith("##"):
            piece = word[2:]  # drop the "##" continuation marker
            if result:
                result[-1] += piece
            else:
                # A continuation piece with nothing before it: there is no
                # previous word to extend, so start a new word rather than
                # raising IndexError on result[-1].
                result.append(piece)
        elif i < len(word_list) - 2 and word_list[i + 1] == "-":
            # Combine "<word>", "-", "<next>" into a single hyphenated word.
            # NOTE: the hyphen is only detected directly after the token that
            # starts a word, so a left-hand part that itself has "##" pieces
            # is not merged with the hyphen.
            result.append(word + word_list[i + 1] + word_list[i + 2])
            i += 2  # skip the hyphen and the word after it
        else:
            result.append(word)
        i += 1
    return result


if __name__ == "__main__":
    # Adjacent string literals are concatenated verbatim, so each fragment
    # needs its own trailing space to keep the words apart.
    text = (
        "The Saudi authorities, I am told, are currently working flat out "
        "to collate everything they have on the Magdeburg market suspect, "
        "Taleb al-Abdulmohsen, and to share it with Germany's ongoing "
        "investigation"
    )
    print(extract_entities(text))