from datetime import datetime
from typing import Any, Dict, Optional
import numpy as np
from models.LexRank import degree_centrality_scores

class QueryProcessor:
    """Embeds a query, extracts its entities, runs a semantic search over the
    article store, and produces a LexRank-based summary of the results."""

    def __init__(self, embedding_model, summarization_model, nlp_model, db_service):
        self.embedding_model = embedding_model
        self.summarization_model = summarization_model
        self.nlp_model = nlp_model
        self.db_service = db_service
    
    async def process(
        self,
        query: str,
        topic: Optional[str] = None,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None
    ) -> Dict[str, Any]:
        """Search articles matching the query (optionally filtered by topic
        and an ISO-format date range) and return a summary plus the articles."""
        # Convert ISO date strings (YYYY-MM-DD) to datetime objects
        start_dt = datetime.strptime(start_date, "%Y-%m-%d") if start_date else None
        end_dt = datetime.strptime(end_date, "%Y-%m-%d") if end_date else None
        
        # Get query embedding
        query_embedding = self.embedding_model.encode(query).tolist()

        # Get entities from the query
        doc = self.nlp_model(query)
        entities = [ent.text.lower() for ent in doc.ents]  # Extract entity texts

        # Semantic search with entities
        articles = await self.db_service.semantic_search(
            query_embedding=query_embedding,
            start_date=start_dt,
            end_date=end_dt,
            topic=topic,
            entities=entities  # Pass entities to the search
        )
        
        if not articles:
            return {"error": "No articles found matching the criteria"}
        
        # Split each article's content into sentences
        contents = [article["content"] for article in articles]
        sentences = []
        for content in contents:
            sentences.extend(self.nlp_model.tokenize_sentences(content))
        
        # Generate summary: rank sentences by LexRank degree centrality over
        # the pairwise inner-product matrix (equivalent to cosine similarity
        # when the embeddings are normalized), then summarize the most central
        if sentences:
            embeddings = self.embedding_model.encode(sentences)
            similarity_matrix = np.inner(embeddings, embeddings)
            centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)
            
            # Keep the 10 most central sentences as the summarizer's input
            top_indices = np.argsort(-centrality_scores)[:10]
            key_sentences = [sentences[idx].strip() for idx in top_indices]
            combined_text = ' '.join(key_sentences)
            
            summary = self.summarization_model.summarize(combined_text)
        else:
            key_sentences = []
            summary = "No content available for summarization"
        
        return {
            "summary": summary,
            "articles": articles
        }
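
# Example wiring (a hypothetical sketch, not part of this module: the model
# names and helper objects below are assumptions; any objects with the same
# duck-typed interface will work):
#
#     import asyncio
#     from sentence_transformers import SentenceTransformer
#
#     processor = QueryProcessor(
#         embedding_model=SentenceTransformer("all-MiniLM-L6-v2"),
#         summarization_model=my_summarizer,  # needs .summarize(text) -> str
#         nlp_model=my_nlp,        # callable like spaCy (doc.ents), plus .tokenize_sentences(text)
#         db_service=my_db,        # needs async .semantic_search(...)
#     )
#     result = asyncio.run(processor.process(
#         query="central bank rate decisions",
#         start_date="2024-01-01",
#         end_date="2024-06-30",
#     ))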