# Keyword and named-entity extraction helpers (KeyBERT + spaCy).
from keybert import KeyBERT
import spacy
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
kw_model = KeyBERT()
nlp = spacy.load("en_core_web_md")
def get_top_keywords(text, top_n=5):
    """Return up to *top_n* deduplicated keyphrases from *text* via KeyBERT.

    Candidate 1-2 word phrases come back ranked from KeyBERT. Phrases that
    collapse to the same string after lowercasing and turning hyphens and
    underscores into spaces are treated as duplicates; only the first
    (highest-ranked) spelling of each is kept, in rank order.
    """
    candidates = kw_model.extract_keywords(
        text, keyphrase_ngram_range=(1, 2), stop_words='english'
    )
    picked = []
    normalized_seen = set()
    for phrase, _score in candidates:
        key = phrase.lower().replace("-", " ").replace("_", " ")
        if key in normalized_seen:
            continue  # same phrase modulo case/hyphenation — skip
        normalized_seen.add(key)
        picked.append(phrase)
        if len(picked) >= top_n:
            break
    return picked
def get_top_named_entities(text, top_n=15):
    """Return up to *top_n* named entities from *text*, most frequent first.

    Only entity types naming concrete people, places, organizations, events,
    products, and facilities are counted; mentions are whitespace-stripped
    before frequency counting.
    """
    wanted_labels = {"PERSON", "ORG", "GPE", "EVENT", "PRODUCT", "LOC", "FAC"}
    mentions = []
    for ent in nlp(text).ents:
        if ent.label_ in wanted_labels:
            mentions.append(ent.text.strip())
    # Counter.most_common gives (entity, count) pairs in descending frequency.
    return [name for name, _count in Counter(mentions).most_common(top_n)]
def detect_events(text):
    """Summarize *text* as a dict of its top keywords and named entities.

    Combines :func:`get_top_keywords` (KeyBERT, default 5) and
    :func:`get_top_named_entities` (spaCy NER, default 15).
    """
    return {
        "Top Keywords (KeyBERT)": get_top_keywords(text),
        "Top Named Entities (NER)": get_top_named_entities(text),
    }