File size: 1,247 Bytes
4a86a4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from keybert import KeyBERT
import spacy
from collections import Counter
# NOTE(review): TfidfVectorizer is not used anywhere in this file — confirm
# before removing the import.
from sklearn.feature_extraction.text import TfidfVectorizer

# Heavy models are loaded once at import time and shared by all calls below.
kw_model = KeyBERT()  # default sentence-transformer backbone
nlp = spacy.load("en_core_web_md")  # medium English pipeline (tagger + NER)

def get_top_keywords(text, top_n=5):
    """Return up to *top_n* deduplicated keyphrases for *text* via KeyBERT.

    Phrases are 1–2 word n-grams with English stop words removed. Near
    duplicates that differ only by case, hyphens, underscores, or extra
    whitespace are collapsed; the first (highest-scored) spelling wins.

    Args:
        text: The document to extract keyphrases from.
        top_n: Maximum number of distinct keyphrases to return.

    Returns:
        A list of at most ``top_n`` keyphrase strings, best-scored first.
    """
    # Ask KeyBERT for extra candidates: extract_keywords' own default top_n
    # is 5, so without headroom the dedup pass below could return fewer than
    # `top_n` phrases even when more distinct ones exist in the text.
    keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        top_n=max(top_n * 3, 10),
    )

    seen = set()
    deduped_keywords = []

    for word, _ in keywords:
        # Normalize so e.g. "state-of-the-art" and "state of the art"
        # collapse to one entry; split/join also folds repeated whitespace
        # introduced by the replacements.
        simplified = " ".join(
            word.lower().replace("-", " ").replace("_", " ").split()
        )
        if simplified not in seen:
            seen.add(simplified)
            deduped_keywords.append(word)
        if len(deduped_keywords) >= top_n:
            break

    return deduped_keywords

def get_top_named_entities(text, top_n=15):
    """Return the *top_n* most frequently mentioned named entities in *text*.

    Only spans whose spaCy label is one of PERSON, ORG, GPE, EVENT,
    PRODUCT, LOC, or FAC are counted. On equal counts, first-seen order
    is kept (``Counter.most_common`` preserves insertion order for ties).

    Args:
        text: The document to run NER over.
        top_n: Maximum number of entity strings to return.

    Returns:
        A list of at most ``top_n`` entity strings, most frequent first.
    """
    wanted_labels = {"PERSON", "ORG", "GPE", "EVENT", "PRODUCT", "LOC", "FAC"}

    parsed = nlp(text)
    mention_counts = Counter(
        span.text.strip() for span in parsed.ents if span.label_ in wanted_labels
    )

    return [name for name, _ in mention_counts.most_common(top_n)]

def detect_events(text):
    """Run both extractors over *text* and bundle their results.

    Args:
        text: The document to analyze.

    Returns:
        A dict mapping a human-readable label to each extractor's output:
        KeyBERT keyphrases and spaCy named entities (both with their
        default limits).
    """
    return {
        "Top Keywords (KeyBERT)": get_top_keywords(text),
        "Top Named Entities (NER)": get_top_named_entities(text),
    }