File size: 1,283 Bytes
3c5f44b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import re

import numpy as np
from langchain.vectorstores import Qdrant
from langchain_huggingface import HuggingFaceEmbeddings
from rank_bm25 import BM25Okapi
from sklearn.preprocessing import MinMaxScaler

from chatbot.retrieval import get_vector_db

# Embedding model object built once, at import time, from the checkpoint below.
# NOTE(review): nothing in the visible part of this file references it —
# presumably other modules import it; confirm before removing.
_EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=_EMBEDDING_MODEL_NAME)

# Matches runs of word characters; used to tokenize text for BM25.
_TOKEN_PATTERN = re.compile(r"\w+")


def _tokenize(text):
    """Return the lowercase word tokens of *text*, dropping punctuation."""
    return _TOKEN_PATTERN.findall(text.lower())


def hybrid_search(query: str, k: int = 6) -> str:
    """Retrieve documents for *query* and rank them by a dense + BM25 blend.

    The vector store returned by ``get_vector_db()`` is asked for the top *k*
    matches. Those same candidates are then scored with BM25 against the
    query, both score lists are min-max normalised, and the two are averaged
    with equal weight. BM25 only sees the dense candidates, so it reorders
    them but never adds documents the dense search missed.

    Args:
        query: The search text.
        k: Number of candidates to request and the maximum number returned.

    Returns:
        The ``page_content`` of the ranked documents joined by newlines,
        best match first, or an empty string when the dense search returns
        nothing.
    """
    vector_db = get_vector_db()

    # Dense search (Qdrant)
    dense_results = vector_db.similarity_search_with_score(query, k=k)
    if not dense_results:
        # Neither BM25 nor the scaler can be fit on zero documents.
        return ""

    # Sparse search (BM25) over the dense candidates. Tokens are lowercased
    # and stripped of punctuation so that e.g. "AI?" in the query matches
    # "ai" / "AI" in a document.
    documents = [doc.page_content for doc, _ in dense_results]
    tokenized_docs = [_tokenize(text) for text in documents]
    if any(tokenized_docs):
        bm25 = BM25Okapi(tokenized_docs)
        bm25_scores = np.asarray(bm25.get_scores(_tokenize(query)), dtype=float)
    else:
        # No candidate contains a single token; BM25 has no vocabulary to fit
        # on, so let the dense scores decide the order alone.
        bm25_scores = np.zeros(len(documents))

    # Normalize scores
    # NOTE(review): assumes a higher dense score means a better match —
    # confirm against the distance metric configured for the collection.
    dense_scores = np.array([score for _, score in dense_results], dtype=float)
    scaler = MinMaxScaler()
    dense_norm = scaler.fit_transform(dense_scores.reshape(-1, 1)).flatten()
    sparse_norm = scaler.fit_transform(bm25_scores.reshape(-1, 1)).flatten()
    combined_scores = 0.5 * dense_norm + 0.5 * sparse_norm

    # Sort results, highest combined score first
    sorted_indices = np.argsort(combined_scores)[::-1]
    final_results = [documents[i] for i in sorted_indices[:k]]

    return "\n".join(final_results)

# Example usage: runs only when this file is executed as a script, so that
# importing hybrid_search from elsewhere does not trigger a search and a print.
if __name__ == "__main__":
    query = "What is AI?"
    results = hybrid_search(query)
    print(results)