Quintino Fernandes committed
Commit · a86dbdc · 1 Parent(s): ba99a45

Similarity matrix fix, another entity fix, more print stuff and a bit of model fine tune

Files changed:
- database/query.py (+2 -5)
- database/query_processor.py (+8 -2)
- models/summarization.py (+2 -2)
database/query.py CHANGED

@@ -19,7 +19,7 @@ class DatabaseService:
         start_date: Optional[datetime] = None,
         end_date: Optional[datetime] = None,
         topic: Optional[str] = None,
-        entities: Optional[List[str]] = None,
+        entities: Optional[List[tuple[str,str]]] = None,
         limit: int = 10
     ) -> List[Dict[str, any]]:
         try:
@@ -31,9 +31,6 @@ class DatabaseService:
             dbname=self.DB_NAME
         ) as conn:
             with conn.cursor() as cursor:
-                # Enable unaccent extension if not already enabled
-                cursor.execute("CREATE EXTENSION IF NOT EXISTS unaccent;")
-
                 # Base query with date range and topic filters
                 base_query = sql.SQL('''
                     WITH filtered_articles AS (
@@ -124,7 +121,7 @@ class DatabaseService:
 
         # Fallback: Retry with fewer filters if no results
         if not articles:
-            print("No articles found with
+            print("No articles found with entities...")
             fallback_query = sql.SQL('''
                 SELECT
                     content,
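The signature change narrows `entities` from bare strings to (text, label) pairs, so the SQL filter can match on both an entity's surface form and its NER tag. Purely as an illustration, a minimal sketch of how such pairs could be turned into a parameterized filter with psycopg2's `sql` module; the `article_entities` table and its columns are assumptions, not the schema this repository actually queries:

    from psycopg2 import sql

    def entity_filter(entities):
        # Build one EXISTS clause per (text, label) pair; the
        # article_entities(article_id, text, label) table is hypothetical.
        clauses, params = [], []
        for text, label in entities:
            clauses.append(sql.SQL(
                "EXISTS (SELECT 1 FROM article_entities e "
                "WHERE e.article_id = a.id AND e.text ILIKE %s AND e.label = %s)"
            ))
            params.extend([f"%{text}%", label])
        return sql.SQL(" AND ").join(clauses), params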
database/query_processor.py CHANGED

@@ -30,6 +30,7 @@ class QueryProcessor:
         # Query processing
         query_embedding = self.embedding_model.encode(query).tolist()
         entities = self.nlp_model.extract_entities(query)
+        print(f"Extracted entities: {entities}")
 
         # Database search
         articles = await self._execute_semantic_search(
@@ -44,6 +45,7 @@ class QueryProcessor:
             return {"message": "No articles found", "articles": []}
 
         # Summary generation
+        print("Starting summary generation")
         summary_data = self._generate_summary(articles)
         return {
             "summary": summary_data["summary"],
@@ -113,15 +115,19 @@ class QueryProcessor:
                 "summary": "No content available for summarization",
                 "key_sentences": []
             }
-
+
+        print("Starting first summary generation")
         embeddings = self.embedding_model.encode(sentences)
-
+        print("Embeddings generated first summary")
+        similarity_matrix = self.embedding_model.similarity(embeddings, embeddings).numpy
         centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)
 
         top_indices = np.argsort(-centrality_scores)[:10]
         key_sentences = [sentences[idx].strip() for idx in top_indices]
         combined_text = ' '.join(key_sentences)
 
+        print(f"First summary done with: {len(key_sentences)} sentences")
+
         return {
             "summary": self.summarization_model.summarize(combined_text),
             "key_sentences": key_sentences
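One caveat on the new similarity line: as committed, `.numpy` lacks call parentheses, so `similarity_matrix` is bound to the tensor's `numpy` method rather than an array, and `degree_centrality_scores` would fail on it. A minimal sketch of the intended LexRank-style extraction, assuming a recent sentence-transformers release (v3+, where `SentenceTransformer.similarity` returns a torch tensor) and `degree_centrality_scores` vendored from the library's LexRank example script (it is not part of the installed API):

    import numpy as np
    from sentence_transformers import SentenceTransformer
    from lex_rank import degree_centrality_scores  # vendored example helper; path assumed

    model = SentenceTransformer("all-MiniLM-L6-v2")  # model choice assumed

    def key_sentences(sentences, top_k=10):
        embeddings = model.encode(sentences, convert_to_tensor=True)
        # Note the () on .numpy(): convert the cosine-similarity tensor to an ndarray
        similarity_matrix = model.similarity(embeddings, embeddings).numpy()
        centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)
        # Most central sentences first; keep the top_k as the extractive summary
        top_indices = np.argsort(-centrality_scores)[:top_k]
        return [sentences[i].strip() for i in top_indices]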
models/summarization.py CHANGED

@@ -19,9 +19,9 @@ class SummarizationModel:
             inputs,
             max_length=max_length,
             min_length=min_length,
-            num_beams=
+            num_beams=4,
             no_repeat_ngram_size=3,
-            early_stopping=
+            early_stopping=True,
         )
 
         return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
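The two pinned values complete the beam-search configuration in the standard Hugging Face `generate` API: `num_beams=4` explores four hypotheses in parallel, and `early_stopping=True` halts once every beam has emitted EOS. For reference, a self-contained sketch of a summarizer wired the same way; the checkpoint name and the 1024-token truncation are assumptions, since the diff does not show how the model is loaded:

    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    class SummarizationModel:
        def __init__(self, model_name: str = "facebook/bart-large-cnn"):  # checkpoint assumed
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

        def summarize(self, text: str, max_length: int = 150, min_length: int = 40) -> str:
            # Truncate long articles to the encoder's context window
            inputs = self.tokenizer.encode(
                text, return_tensors="pt", truncation=True, max_length=1024
            )
            summary_ids = self.model.generate(
                inputs,
                max_length=max_length,
                min_length=min_length,
                num_beams=4,             # beam search instead of greedy decoding
                no_repeat_ngram_size=3,  # never repeat a trigram in the output
                early_stopping=True,     # stop once all beams reach EOS
            )
            return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)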