# test / models / nlp.py
# christopher
# Removed NLTK-related functionality
# e21244d
import spacy
from typing import List, Union
import logging
logger = logging.getLogger(__name__)


class NLPModel:
    """Wrapper around a spaCy pipeline for entity and sentence extraction.

    The extraction methods are best-effort: any failure is logged (with
    traceback) and a fallback value is returned instead of raising.
    """

    def __init__(self, model_name: str = "pt_core_news_md"):
        """Load the spaCy pipeline.

        Args:
            model_name: Name of the spaCy model passed to ``spacy.load``.
                Defaults to ``"pt_core_news_md"``.

        Raises:
            Exception: Whatever ``spacy.load`` raises is logged and re-raised.
        """
        try:
            # Load spaCy model only
            self.nlp = spacy.load(model_name)
        except Exception as e:
            # The exception is re-raised, so the caller gets the traceback;
            # logging only the message avoids duplicating it.
            logger.error("Failed to initialize spaCy model: %s", e)
            raise
        else:
            logger.info("spaCy model initialized successfully")

    @staticmethod
    def _as_text(text: Union[str, List[str]]) -> str:
        """Join a list of strings with single spaces; pass anything else through."""
        if isinstance(text, list):
            return " ".join(text)
        return text

    def extract_entities(self, text: Union[str, List[str]]) -> List[tuple]:
        """Extract named entities using spaCy.

        Args:
            text: A string, or a list of strings that is joined with spaces
                before processing.

        Returns:
            A list of ``(lower-cased entity text, entity label)`` tuples, or
            an empty list if processing fails.
        """
        try:
            doc = self.nlp(self._as_text(text))
            return [(ent.text.lower(), ent.label_) for ent in doc.ents]
        except Exception as e:
            # The error is swallowed here, so keep the traceback in the log.
            logger.exception("Entity extraction failed: %s", e)
            return []

    def tokenize_sentences(self, text: Union[str, List[str]]) -> List[str]:
        """Split text into sentences using spaCy.

        Args:
            text: A string, or a list of strings that is joined with spaces
                before processing (same convention as ``extract_entities``).

        Returns:
            The text of each sentence found by the pipeline. If processing
            fails, the whole (joined) text as a single-element list, or an
            empty list when the input could not be turned into a string.
        """
        joined = None
        try:
            joined = self._as_text(text)
            doc = self.nlp(joined)
            return [sent.text for sent in doc.sents]
        except Exception as e:
            # The error is swallowed here, so keep the traceback in the log.
            logger.exception("Sentence tokenization failed: %s", e)
            # Fallback to returning whole text, but only when it really is a
            # string, so the declared List[str] return type always holds.
            return [joined] if isinstance(joined, str) else []