christopher committed on
Commit
0424ce2
·
1 Parent(s): 21f3f8a

reverted query processor

Browse files
Files changed (1) hide show
  1. database/query_processor.py +22 -16
database/query_processor.py CHANGED
@@ -29,28 +29,22 @@ class QueryProcessor:
29
 
30
  # Query processing
31
  query_embedding = self.embedding_model.encode(query).tolist()
32
- logger.debug(f"Generated embedding for query: {query[:50]}...")
33
-
34
- # Entity extraction
35
  entities = self.nlp_model.extract_entities(query)
36
- logger.debug(f"Extracted entities: {entities}")
37
 
38
  # Database search
39
- articles = await self._execute_semantic_search(
40
  query_embedding,
41
  start_dt,
42
  end_dt,
43
  topic,
44
- [ent[0] for ent in entities] # Just the entity texts
45
  )
46
 
47
  if not articles:
48
- logger.info("No articles found matching criteria")
49
  return {"message": "No articles found", "articles": []}
50
 
51
  # Summary generation
52
  summary_data = self._generate_summary(articles)
53
-
54
  return {
55
  "summary": summary_data["summary"],
56
  "key_sentences": summary_data["key_sentences"],
@@ -70,22 +64,34 @@ class QueryProcessor:
70
  logger.error(f"Invalid date format: {date_str}")
71
  raise ValueError(f"Invalid date format. Expected YYYY-MM-DD, got {date_str}")
72
 
 
 
 
 
 
 
 
 
 
 
 
73
  async def _execute_semantic_search(
74
  self,
75
  query_embedding: List[float],
76
  start_date: Optional[dt],
77
  end_date: Optional[dt],
78
  topic: Optional[str],
79
- entities: List[str]
80
  ) -> List[Dict[str, Any]]:
81
  """Execute search with proper error handling"""
82
  try:
 
83
  return await self.db_service.semantic_search(
84
  query_embedding=query_embedding,
85
  start_date=start_date,
86
  end_date=end_date,
87
  topic=topic,
88
- entities=entities
89
  )
90
  except Exception as e:
91
  logger.error(f"Semantic search failed: {str(e)}")
@@ -94,10 +100,11 @@ class QueryProcessor:
94
  def _generate_summary(self, articles: List[Dict[str, Any]]) -> Dict[str, Any]:
95
  """Generate summary from articles with fallback handling"""
96
  try:
97
- # Extract and process content
98
  sentences = []
99
- for article in articles:
100
- if content := article.get("content"):
 
101
  sentences.extend(self.nlp_model.tokenize_sentences(content))
102
 
103
  if not sentences:
@@ -107,17 +114,16 @@ class QueryProcessor:
107
  "key_sentences": []
108
  }
109
 
110
- # Generate summary
111
  embeddings = self.embedding_model.encode(sentences)
112
  similarity_matrix = np.inner(embeddings, embeddings)
113
  centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)
114
 
115
- # Get top 10 most central sentences
116
  top_indices = np.argsort(-centrality_scores)[:10]
117
  key_sentences = [sentences[idx].strip() for idx in top_indices]
 
118
 
119
  return {
120
- "summary": self.summarization_model.summarize(' '.join(key_sentences)),
121
  "key_sentences": key_sentences
122
  }
123
 
 
29
 
30
  # Query processing
31
  query_embedding = self.embedding_model.encode(query).tolist()
 
 
 
32
  entities = self.nlp_model.extract_entities(query)
 
33
 
34
  # Database search
35
+ articles = await self._execute_search(
36
  query_embedding,
37
  start_dt,
38
  end_dt,
39
  topic,
40
+ [ent[0] for ent in entities]
41
  )
42
 
43
  if not articles:
 
44
  return {"message": "No articles found", "articles": []}
45
 
46
  # Summary generation
47
  summary_data = self._generate_summary(articles)
 
48
  return {
49
  "summary": summary_data["summary"],
50
  "key_sentences": summary_data["key_sentences"],
 
64
  logger.error(f"Invalid date format: {date_str}")
65
  raise ValueError(f"Invalid date format. Expected YYYY-MM-DD, got {date_str}")
66
 
67
def _extract_entities_safely(self, text: str) -> List[Tuple[str, str]]:
    """Extract entities from ``text`` without propagating errors.

    If ``text`` is a list, a warning is logged and its items are joined
    with single spaces before extraction. The resulting value is passed to
    ``self.nlp_model.extract_entities`` and that call's result is returned.

    Any exception raised during the type check, the join, or the extraction
    is logged as an error and an empty list is returned instead.
    """
    try:
        source = text
        if isinstance(source, list):
            logger.warning("Received list input for entity extraction, joining to string")
            source = " ".join(source)
        extracted = self.nlp_model.extract_entities(source)
    except Exception as e:
        # Best-effort by design: the failure is logged and replaced by an empty result.
        logger.error(f"Entity extraction failed: {str(e)}")
        return []
    return extracted
77
+
78
  async def _execute_semantic_search(
79
  self,
80
  query_embedding: List[float],
81
  start_date: Optional[dt],
82
  end_date: Optional[dt],
83
  topic: Optional[str],
84
+ entities: List[Tuple[str, str]]
85
  ) -> List[Dict[str, Any]]:
86
  """Execute search with proper error handling"""
87
  try:
88
+ entity_texts = [ent[0] for ent in entities]
89
  return await self.db_service.semantic_search(
90
  query_embedding=query_embedding,
91
  start_date=start_date,
92
  end_date=end_date,
93
  topic=topic,
94
+ entities=entity_texts
95
  )
96
  except Exception as e:
97
  logger.error(f"Semantic search failed: {str(e)}")
 
100
  def _generate_summary(self, articles: List[Dict[str, Any]]) -> Dict[str, Any]:
101
  """Generate summary from articles with fallback handling"""
102
  try:
103
+ contents = [article["content"] for article in articles]
104
  sentences = []
105
+
106
+ for content in contents:
107
+ if content:
108
  sentences.extend(self.nlp_model.tokenize_sentences(content))
109
 
110
  if not sentences:
 
114
  "key_sentences": []
115
  }
116
 
 
117
  embeddings = self.embedding_model.encode(sentences)
118
  similarity_matrix = np.inner(embeddings, embeddings)
119
  centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)
120
 
 
121
  top_indices = np.argsort(-centrality_scores)[:10]
122
  key_sentences = [sentences[idx].strip() for idx in top_indices]
123
+ combined_text = ' '.join(key_sentences)
124
 
125
  return {
126
+ "summary": self.summarization_model.summarize(combined_text),
127
  "key_sentences": key_sentences
128
  }
129