import logging
from typing import List, Tuple, Union

import spacy
# Module-level logger, named after this module so handlers/levels can be
# configured per module by the application.
logger = logging.getLogger(__name__)
class NLPModel:
    """Wrapper around a spaCy pipeline for entity and sentence extraction.

    The loaded pipeline is stored on ``self.nlp``. Both extraction methods
    are best-effort: failures are logged and a fallback value is returned
    instead of propagating the exception.
    """

    def __init__(self, model_name: str = "pt_core_news_md"):
        """Load the spaCy pipeline.

        Args:
            model_name: Name or path passed to ``spacy.load``. Defaults to
                ``"pt_core_news_md"``, the value that was previously
                hard-coded.

        Raises:
            Exception: Any error raised by ``spacy.load`` is logged and
                re-raised unchanged.
        """
        try:
            self.nlp = spacy.load(model_name)
        except Exception as e:
            # The exception is re-raised, so the caller still gets the
            # traceback; a one-line error record is enough here.
            logger.error("Failed to initialize spaCy model: %s", e)
            raise
        else:
            logger.info("spaCy model initialized successfully")

    @staticmethod
    def _as_text(text: Union[str, List[str]]) -> str:
        """Return *text* as one string, space-joining it when it is a list."""
        if isinstance(text, list):
            return " ".join(text)
        return text

    def extract_entities(self, text: Union[str, List[str]]) -> List[Tuple[str, str]]:
        """Extract named entities using the spaCy pipeline.

        Args:
            text: A string, or a list of strings that is joined with single
                spaces before processing.

        Returns:
            A list of ``(entity_text_lowercased, entity_label)`` tuples.
            Empty input, or any failure during processing, yields ``[]``.
        """
        try:
            text = self._as_text(text)
            if not text:
                # Nothing to analyse; skip the pipeline call entirely.
                return []
            doc = self.nlp(text)
            return [(ent.text.lower(), ent.label_) for ent in doc.ents]
        except Exception as e:
            # The error is swallowed here, so record the traceback in the
            # log; otherwise it would be lost.
            logger.exception("Entity extraction failed: %s", e)
            return []

    def tokenize_sentences(self, text: Union[str, List[str]]) -> List[str]:
        """Split text into sentences using the spaCy pipeline.

        Args:
            text: A string, or a list of strings that is joined with single
                spaces before processing (same convention as
                ``extract_entities``).

        Returns:
            The text of each sentence found by the pipeline. Empty input
            yields ``[]``. If processing fails, the input is returned as a
            single-element list.
        """
        try:
            text = self._as_text(text)
            if not text:
                return []
            doc = self.nlp(text)
            return [sent.text for sent in doc.sents]
        except Exception as e:
            # The error is swallowed here, so record the traceback in the
            # log; otherwise it would be lost.
            logger.exception("Sentence tokenization failed: %s", e)
            return [text]  # Fallback to returning whole text