File size: 2,528 Bytes
28ec96b a2682b3 28ec96b a2682b3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import datetime
from typing import List, Dict, Any, Optional
import numpy as np
from models.LexRank import degree_centrality_scores
class QueryProcessor:
    """Answer a natural-language query over a stored article corpus.

    Pipeline: embed the query -> extract named entities -> semantic search
    in the database -> split matching articles into sentences -> rank
    sentences by LexRank degree centrality -> abstractive summary of the
    top sentences.
    """

    def __init__(self, embedding_model, summarization_model, nlp_model, db_service):
        """Store injected collaborators.

        Args:
            embedding_model: exposes ``encode(text) -> vector`` (numpy-like).
            summarization_model: exposes ``summarize(text) -> str``.
            nlp_model: callable NER pipeline (``nlp(text).ents``) that also
                provides ``tokenize_sentences(text) -> list[str]``.
            db_service: async store exposing ``semantic_search(...)``.
        """
        self.embedding_model = embedding_model
        self.summarization_model = summarization_model
        self.nlp_model = nlp_model
        self.db_service = db_service

    @staticmethod
    def _parse_date(value: Optional[str]) -> Optional[datetime.datetime]:
        """Parse a ``YYYY-MM-DD`` string into a datetime, or None if falsy.

        BUG FIX: the original called ``datetime.strptime`` on the *module*
        (the file uses ``import datetime``), which raises AttributeError at
        runtime; the class is what carries ``strptime``.
        """
        return datetime.datetime.strptime(value, "%Y-%m-%d") if value else None

    def _rank_sentences(self, sentences: List[str], top_k: int = 10) -> List[str]:
        """Return the ``top_k`` most central sentences (LexRank).

        Sentences are embedded, pairwise inner products form the similarity
        matrix, and degree centrality picks the most representative ones.
        ``top_k`` generalizes the previously hard-coded 10.
        """
        embeddings = self.embedding_model.encode(sentences)
        similarity_matrix = np.inner(embeddings, embeddings)
        centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)
        # Negate so argsort yields descending centrality order.
        top_indices = np.argsort(-centrality_scores)[:top_k]
        return [sentences[idx].strip() for idx in top_indices]

    async def process(
        self,
        query: str,
        topic: Optional[str] = None,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None
    ) -> Dict[str, Any]:
        """Run the full query pipeline.

        Args:
            query: free-text user query.
            topic: optional topic filter passed through to the search.
            start_date / end_date: optional ``YYYY-MM-DD`` bounds.

        Returns:
            ``{"summary": str, "articles": list}`` on success, or
            ``{"error": str}`` when no articles match.

        Raises:
            ValueError: if a supplied date does not match ``YYYY-MM-DD``.
        """
        start_dt = self._parse_date(start_date)
        end_dt = self._parse_date(end_date)

        # Embed the query once; the DB performs vector similarity against it.
        query_embedding = self.embedding_model.encode(query).tolist()

        # Lowercased entity texts narrow the semantic search.
        doc = self.nlp_model(query)
        entities = [ent.text.lower() for ent in doc.ents]

        articles = await self.db_service.semantic_search(
            query_embedding=query_embedding,
            start_date=start_dt,
            end_date=end_dt,
            topic=topic,
            entities=entities
        )
        if not articles:
            return {"error": "No articles found matching the criteria"}

        # Split every matching article body into sentences for ranking.
        sentences: List[str] = []
        for article in articles:
            sentences.extend(self.nlp_model.tokenize_sentences(article["content"]))

        if sentences:
            key_sentences = self._rank_sentences(sentences)
            summary = self.summarization_model.summarize(' '.join(key_sentences))
        else:
            summary = "No content available for summarization"

        return {
            "summary": summary,
            "articles": articles
        }