christopher committed
Commit e67b064 · 1 Parent(s): c8d57fb

Added safe processing in query processor

Files changed (1):
  1. database/query_processor.py +98 -40
database/query_processor.py CHANGED
@@ -1,8 +1,9 @@
 import datetime
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any, Optional, Tuple
 import numpy as np
 from models.LexRank import degree_centrality_scores
 import logging
+from datetime import datetime as dt
 
 logger = logging.getLogger(__name__)
 
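A note on the import pair above: the commit keeps the module-level `import datetime` and adds `from datetime import datetime as dt`, likely because the code it removes below called `strptime` on the module (`datetime.strptime(...)`), which fails; `strptime` lives on the `datetime.datetime` class, aliased here as `dt`. A quick standalone illustration (not part of the commit):

    import datetime
    from datetime import datetime as dt

    # datetime.strptime("2024-01-31", "%Y-%m-%d") would raise
    # AttributeError: module 'datetime' has no attribute 'strptime'.
    # The class method, reached through the dt alias, works:
    parsed = dt.strptime("2024-01-31", "%Y-%m-%d")
    print(parsed.date())  # 2024-01-31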
 
@@ -21,62 +22,119 @@ class QueryProcessor:
         end_date: Optional[str] = None
     ) -> Dict[str, Any]:
         try:
-            # Convert string dates to datetime objects
-            start_dt = datetime.strptime(start_date, "%Y-%m-%d") if start_date else None
-            end_dt = datetime.strptime(end_date, "%Y-%m-%d") if end_date else None
+            # Validate and parse dates
+            start_dt = self._parse_date(start_date) if start_date else None
+            end_dt = self._parse_date(end_date) if end_date else None
 
             # Get query embedding
             query_embedding = self.embedding_model.encode(query).tolist()
-            logger.debug(f"Generated query embedding for: {query[:50]}...")
+            logger.debug(f"Query embedding generated for: {query[:50]}...")
 
-            # Extract entities using the NLP model
-            entities = self.nlp_model.extract_entities(query)  # Changed from direct call to using method
+            # Extract entities safely
+            entities = self._extract_entities_safely(query)
             logger.debug(f"Extracted entities: {entities}")
 
-            # Semantic search with entities
-            articles = await self.db_service.semantic_search(
-                query_embedding=query_embedding,
-                start_date=start_dt,
-                end_date=end_dt,
-                topic=topic,
-                entities=[ent[0] for ent in entities]  # Using just the entity texts
+            # Semantic search
+            articles = await self._execute_semantic_search(
+                query_embedding,
+                start_dt,
+                end_dt,
+                topic,
+                entities
             )
 
             if not articles:
-                logger.info("No articles found matching search criteria")
-                return {"error": "No articles found matching the criteria"}
+                logger.info("No articles found matching criteria")
+                return {"message": "No articles found", "articles": []}
+
+            # Process results and generate summary
+            summary_result = self._generate_summary(articles)
 
-            # Process results
+            return {
+                "summary": summary_result["summary"],
+                "key_sentences": summary_result["key_sentences"],
+                "articles": articles,
+                "entities": entities
+            }
+
+        except Exception as e:
+            logger.error(f"Processing failed: {str(e)}", exc_info=True)
+            return {"error": str(e)}
+
+    def _parse_date(self, date_str: str) -> dt:
+        """Safe date parsing with validation"""
+        try:
+            return dt.strptime(date_str, "%Y-%m-%d")
+        except ValueError as e:
+            logger.error(f"Invalid date format: {date_str}")
+            raise ValueError(f"Invalid date format. Expected YYYY-MM-DD, got {date_str}")
+
+    def _extract_entities_safely(self, text: str) -> List[Tuple[str, str]]:
+        """Robust entity extraction handling both strings and lists"""
+        try:
+            if isinstance(text, list):
+                logger.warning("Received list input for entity extraction, joining to string")
+                text = " ".join(text)
+            return self.nlp_model.extract_entities(text)
+        except Exception as e:
+            logger.error(f"Entity extraction failed: {str(e)}")
+            return []
+
+    async def _execute_semantic_search(
+        self,
+        query_embedding: List[float],
+        start_date: Optional[dt],
+        end_date: Optional[dt],
+        topic: Optional[str],
+        entities: List[Tuple[str, str]]
+    ) -> List[Dict[str, Any]]:
+        """Execute search with proper error handling"""
+        try:
+            entity_texts = [ent[0] for ent in entities]
+            return await self.db_service.semantic_search(
+                query_embedding=query_embedding,
+                start_date=start_date,
+                end_date=end_date,
+                topic=topic,
+                entities=entity_texts
+            )
+        except Exception as e:
+            logger.error(f"Semantic search failed: {str(e)}")
+            raise
+
+    def _generate_summary(self, articles: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Generate summary from articles with fallback handling"""
+        try:
             contents = [article["content"] for article in articles]
             sentences = []
+
             for content in contents:
-                sentences.extend(self.nlp_model.tokenize_sentences(content))
+                if content:
+                    sentences.extend(self.nlp_model.tokenize_sentences(content))
 
-            logger.debug(f"Processing {len(sentences)} sentences for summarization")
-
-            # Generate summary
-            if sentences:
-                embeddings = self.embedding_model.encode(sentences)
-                similarity_matrix = np.inner(embeddings, embeddings)
-                centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)
-
-                top_indices = np.argsort(-centrality_scores)[0:10]
-                key_sentences = [sentences[idx].strip() for idx in top_indices]
-                combined_text = ' '.join(key_sentences)
-
-                summary = self.summarization_model.summarize(combined_text)
-                logger.debug(f"Generated summary with {len(key_sentences)} key sentences")
-            else:
-                key_sentences = []
-                summary = "No content available for summarization"
+            if not sentences:
                 logger.warning("No sentences available for summarization")
+                return {
+                    "summary": "No content available for summarization",
+                    "key_sentences": []
+                }
+
+            embeddings = self.embedding_model.encode(sentences)
+            similarity_matrix = np.inner(embeddings, embeddings)
+            centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)
+
+            top_indices = np.argsort(-centrality_scores)[:10]
+            key_sentences = [sentences[idx].strip() for idx in top_indices]
+            combined_text = ' '.join(key_sentences)
 
             return {
-                "summary": summary,
-                "articles": articles,
-                "entities": entities  # Include extracted entities in response
+                "summary": self.summarization_model.summarize(combined_text),
+                "key_sentences": key_sentences
             }
 
         except Exception as e:
-            logger.error(f"Error in QueryProcessor: {str(e)}", exc_info=True)
-            return {"error": f"Processing error: {str(e)}"}
+            logger.error(f"Summary generation failed: {str(e)}")
+            return {
+                "summary": "Summary generation failed",
+                "key_sentences": []
+            }