from datetime import datetime
from typing import Any, Dict, List, Optional

import numpy as np

from models.LexRank import degree_centrality_scores


class QueryProcessor:
    def __init__(self, embedding_model, summarization_model, nlp_model, db_service):
        self.embedding_model = embedding_model
        self.summarization_model = summarization_model
        self.nlp_model = nlp_model
        self.db_service = db_service

    async def process(
        self,
        query: str,
        topic: Optional[str] = None,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
    ) -> Dict[str, Any]:
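        """Semantically search stored articles and summarize the best matches."""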
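        # Parse optional ISO (YYYY-MM-DD) date bounds into datetime objects.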
        start_dt = datetime.strptime(start_date, "%Y-%m-%d") if start_date else None
        end_dt = datetime.strptime(end_date, "%Y-%m-%d") if end_date else None
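
        # Embed the query for vector similarity search.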
        query_embedding = self.embedding_model.encode(query).tolist()
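
        # Extract named entities from the query to use as search filters.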
        doc = self.nlp_model(query)
        entities = [ent.text.lower() for ent in doc.ents]
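
        # Retrieve candidate articles matching the embedding and filters.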
        articles = await self.db_service.semantic_search(
            query_embedding=query_embedding,
            start_date=start_dt,
            end_date=end_dt,
            topic=topic,
            entities=entities,
        )
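
        # Bail out early when nothing matched the criteria.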
        if not articles:
            return {"error": "No articles found matching the criteria"}
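
        # Split every article body into sentences for centrality scoring.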
        contents = [article["content"] for article in articles]
        sentences: List[str] = []
        for content in contents:
            sentences.extend(self.nlp_model.tokenize_sentences(content))
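
        # LexRank-style extractive step: score each sentence by its degree
        # centrality over the pairwise inner-product similarity matrix
        # (equivalent to cosine similarity if the embeddings are normalized).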
        if sentences:
            embeddings = self.embedding_model.encode(sentences)
            similarity_matrix = np.inner(embeddings, embeddings)
            centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)
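
            # Keep the ten most central sentences as the extractive digest,
            # then condense them with the abstractive summarizer.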
            top_indices = np.argsort(-centrality_scores)[:10]
            key_sentences = [sentences[idx].strip() for idx in top_indices]
            combined_text = " ".join(key_sentences)

            summary = self.summarization_model.summarize(combined_text)
        else:
            key_sentences = []
            summary = "No content available for summarization"

        return {
            "summary": summary,
            "articles": articles,
        }
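

# Example wiring (illustrative only; the concrete dependencies are assumptions,
# e.g. a sentence-transformers encoder, a spaCy pipeline, and any summarizer and
# DB service exposing the methods used above):
#
#   processor = QueryProcessor(embedder, summarizer, nlp, db)
#   result = await processor.process("chip shortage", topic="tech",
#                                    start_date="2024-01-01")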