christopher committed on
Commit
e21244d
·
1 Parent(s): e67b064

Removed NLTK-related functionality

Browse files
Files changed (2) hide show
  1. database/query_processor.py +13 -18
  2. models/nlp.py +31 -18
database/query_processor.py CHANGED
@@ -13,7 +13,8 @@ class QueryProcessor:
13
  self.summarization_model = summarization_model
14
  self.nlp_model = nlp_model
15
  self.db_service = db_service
16
-
 
17
  async def process(
18
  self,
19
  query: str,
@@ -22,37 +23,31 @@ class QueryProcessor:
22
  end_date: Optional[str] = None
23
  ) -> Dict[str, Any]:
24
  try:
25
- # Validate and parse dates
26
  start_dt = self._parse_date(start_date) if start_date else None
27
  end_dt = self._parse_date(end_date) if end_date else None
28
 
29
- # Get query embedding
30
  query_embedding = self.embedding_model.encode(query).tolist()
31
- logger.debug(f"Query embedding generated for: {query[:50]}...")
32
-
33
- # Extract entities safely
34
- entities = self._extract_entities_safely(query)
35
- logger.debug(f"Extracted entities: {entities}")
36
-
37
- # Semantic search
38
- articles = await self._execute_semantic_search(
39
  query_embedding,
40
  start_dt,
41
  end_dt,
42
  topic,
43
- entities
44
  )
45
 
46
  if not articles:
47
- logger.info("No articles found matching criteria")
48
  return {"message": "No articles found", "articles": []}
49
 
50
- # Process results and generate summary
51
- summary_result = self._generate_summary(articles)
52
-
53
  return {
54
- "summary": summary_result["summary"],
55
- "key_sentences": summary_result["key_sentences"],
56
  "articles": articles,
57
  "entities": entities
58
  }
 
13
  self.summarization_model = summarization_model
14
  self.nlp_model = nlp_model
15
  self.db_service = db_service
16
+ logger.info("QueryProcessor initialized")
17
+
18
  async def process(
19
  self,
20
  query: str,
 
23
  end_date: Optional[str] = None
24
  ) -> Dict[str, Any]:
25
  try:
26
+ # Date handling
27
  start_dt = self._parse_date(start_date) if start_date else None
28
  end_dt = self._parse_date(end_date) if end_date else None
29
 
30
+ # Query processing
31
  query_embedding = self.embedding_model.encode(query).tolist()
32
+ entities = self.nlp_model.extract_entities(query)
33
+
34
+ # Database search
35
+ articles = await self._execute_search(
 
 
 
 
36
  query_embedding,
37
  start_dt,
38
  end_dt,
39
  topic,
40
+ [ent[0] for ent in entities]
41
  )
42
 
43
  if not articles:
 
44
  return {"message": "No articles found", "articles": []}
45
 
46
+ # Summary generation
47
+ summary_data = self._generate_summary(articles)
 
48
  return {
49
+ "summary": summary_data["summary"],
50
+ "key_sentences": summary_data["key_sentences"],
51
  "articles": articles,
52
  "entities": entities
53
  }
models/nlp.py CHANGED
@@ -1,22 +1,35 @@
1
  import spacy
2
- import nltk
 
 
 
3
 
4
  class NLPModel:
5
  def __init__(self):
6
- self.nlp = spacy.load("pt_core_news_md")
7
- nltk.download('punkt')
8
-
9
- def __call__(self, text: str):
10
- """Makes the model callable like model(text)."""
11
- return self.extract_entities(text) # or another default method
12
-
13
- def extract_entities(self, text: str):
14
- """Ensure this always takes a string and returns entities"""
15
- if isinstance(text, list): # If accidentally passed a list
16
- text = " ".join(text) # Combine into single string
17
- doc = self.nlp(text)
18
- return [(ent.text.lower(), ent.label_) for ent in doc.ents]
19
-
20
-
21
- def tokenize_sentences(self, text: str):
22
- return nltk.sent_tokenize(text)
 
 
 
 
 
 
 
 
 
 
 
1
  import spacy
2
+ from typing import List, Union
3
+ import logging
4
+
5
+ logger = logging.getLogger(__name__)
6
 
7
  class NLPModel:
8
  def __init__(self):
9
+ try:
10
+ # Load spaCy model only
11
+ self.nlp = spacy.load("pt_core_news_md")
12
+ logger.info("spaCy model initialized successfully")
13
+ except Exception as e:
14
+ logger.error(f"Failed to initialize spaCy model: {str(e)}")
15
+ raise
16
+
17
+ def extract_entities(self, text: Union[str, List[str]]) -> List[tuple]:
18
+ """Entity extraction using spaCy"""
19
+ try:
20
+ if isinstance(text, list):
21
+ text = " ".join(text)
22
+ doc = self.nlp(text)
23
+ return [(ent.text.lower(), ent.label_) for ent in doc.ents]
24
+ except Exception as e:
25
+ logger.error(f"Entity extraction failed: {str(e)}")
26
+ return []
27
+
28
+ def tokenize_sentences(self, text: str) -> List[str]:
29
+ """Sentence tokenization using spaCy"""
30
+ try:
31
+ doc = self.nlp(text)
32
+ return [sent.text for sent in doc.sents]
33
+ except Exception as e:
34
+ logger.error(f"Sentence tokenization failed: {str(e)}")
35
+ return [text] # Fallback to returning whole text