Quintino Fernandes committed on
Commit
a86dbdc
·
1 Parent(s): ba99a45

Similarity matrix fix, another entity fix, more print statements, and a bit of model fine-tuning

Browse files
database/query.py CHANGED
@@ -19,7 +19,7 @@ class DatabaseService:
19
  start_date: Optional[datetime] = None,
20
  end_date: Optional[datetime] = None,
21
  topic: Optional[str] = None,
22
- entities: Optional[List[str]] = None,
23
  limit: int = 10
24
  ) -> List[Dict[str, any]]:
25
  try:
@@ -31,9 +31,6 @@ class DatabaseService:
31
  dbname=self.DB_NAME
32
  ) as conn:
33
  with conn.cursor() as cursor:
34
- # Enable unaccent extension if not already enabled
35
- cursor.execute("CREATE EXTENSION IF NOT EXISTS unaccent;")
36
-
37
  # Base query with date range and topic filters
38
  base_query = sql.SQL('''
39
  WITH filtered_articles AS (
@@ -124,7 +121,7 @@ class DatabaseService:
124
 
125
  # Fallback: Retry with fewer filters if no results
126
  if not articles:
127
- print("No articles found with all filters. Relaxing filters...")
128
  fallback_query = sql.SQL('''
129
  SELECT
130
  content,
 
19
  start_date: Optional[datetime] = None,
20
  end_date: Optional[datetime] = None,
21
  topic: Optional[str] = None,
22
+ entities: Optional[List[tuple[str,str]]] = None,
23
  limit: int = 10
24
  ) -> List[Dict[str, any]]:
25
  try:
 
31
  dbname=self.DB_NAME
32
  ) as conn:
33
  with conn.cursor() as cursor:
 
 
 
34
  # Base query with date range and topic filters
35
  base_query = sql.SQL('''
36
  WITH filtered_articles AS (
 
121
 
122
  # Fallback: Retry with fewer filters if no results
123
  if not articles:
124
+ print("No articles found with entities...")
125
  fallback_query = sql.SQL('''
126
  SELECT
127
  content,
database/query_processor.py CHANGED
@@ -30,6 +30,7 @@ class QueryProcessor:
30
  # Query processing
31
  query_embedding = self.embedding_model.encode(query).tolist()
32
  entities = self.nlp_model.extract_entities(query)
 
33
 
34
  # Database search
35
  articles = await self._execute_semantic_search(
@@ -44,6 +45,7 @@ class QueryProcessor:
44
  return {"message": "No articles found", "articles": []}
45
 
46
  # Summary generation
 
47
  summary_data = self._generate_summary(articles)
48
  return {
49
  "summary": summary_data["summary"],
@@ -113,15 +115,19 @@ class QueryProcessor:
113
  "summary": "No content available for summarization",
114
  "key_sentences": []
115
  }
116
-
 
117
  embeddings = self.embedding_model.encode(sentences)
118
- similarity_matrix = np.inner(embeddings, embeddings)
 
119
  centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)
120
 
121
  top_indices = np.argsort(-centrality_scores)[:10]
122
  key_sentences = [sentences[idx].strip() for idx in top_indices]
123
  combined_text = ' '.join(key_sentences)
124
 
 
 
125
  return {
126
  "summary": self.summarization_model.summarize(combined_text),
127
  "key_sentences": key_sentences
 
30
  # Query processing
31
  query_embedding = self.embedding_model.encode(query).tolist()
32
  entities = self.nlp_model.extract_entities(query)
33
+ print(f"Extracted entities: {entities}")
34
 
35
  # Database search
36
  articles = await self._execute_semantic_search(
 
45
  return {"message": "No articles found", "articles": []}
46
 
47
  # Summary generation
48
+ print("Starting summary generation")
49
  summary_data = self._generate_summary(articles)
50
  return {
51
  "summary": summary_data["summary"],
 
115
  "summary": "No content available for summarization",
116
  "key_sentences": []
117
  }
118
+
119
+ print("Starting first summary generation")
120
  embeddings = self.embedding_model.encode(sentences)
121
+ print("Embeddings generated first summary")
122
+ similarity_matrix = self.embedding_model.similarity(embeddings, embeddings).numpy
123
  centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)
124
 
125
  top_indices = np.argsort(-centrality_scores)[:10]
126
  key_sentences = [sentences[idx].strip() for idx in top_indices]
127
  combined_text = ' '.join(key_sentences)
128
 
129
+ print(f"First summary done with: {len(key_sentences)} sentences")
130
+
131
  return {
132
  "summary": self.summarization_model.summarize(combined_text),
133
  "key_sentences": key_sentences
models/summarization.py CHANGED
@@ -19,9 +19,9 @@ class SummarizationModel:
19
  inputs,
20
  max_length=max_length,
21
  min_length=min_length,
22
- num_beams=5,
23
  no_repeat_ngram_size=3,
24
- early_stopping=False
25
  )
26
 
27
  return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 
19
  inputs,
20
  max_length=max_length,
21
  min_length=min_length,
22
+ num_beams=4,
23
  no_repeat_ngram_size=3,
24
+ early_stopping=True,
25
  )
26
 
27
  return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)