"""Hybrid retrieval: dense vector search re-ranked with BM25 keyword scores."""

import numpy as np
from langchain.vectorstores import Qdrant  # kept: may be relied on by importers
from langchain_huggingface import HuggingFaceEmbeddings
from rank_bm25 import BM25Okapi
from sklearn.preprocessing import MinMaxScaler

from chatbot.retrieval import get_vector_db

embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


def _min_max(values):
    """Rescale a 1-D sequence of scores to the [0, 1] range."""
    column = np.asarray(values, dtype=float).reshape(-1, 1)
    # A fresh scaler per call keeps the two score sets independent.
    return MinMaxScaler().fit_transform(column).ravel()


def hybrid_search(query, k=6, *, dense_weight=0.5):
    """Return the top-``k`` passages for ``query`` as newline-joined text.

    The vector store supplies up to ``k`` candidates via dense similarity
    search. Those same candidates are then scored with BM25 (whitespace
    tokenisation, case-sensitive), both score sets are min-max normalised,
    and the weighted sum decides the final order. BM25 therefore only
    re-ranks the dense candidates; it never introduces new documents.

    Args:
        query: Free-text search query.
        k: Number of candidates to fetch and, at most, to return.
        dense_weight: Share of the combined score given to the dense
            similarity, in ``[0, 1]``; BM25 receives ``1 - dense_weight``.

    Returns:
        The ranked passages' ``page_content`` joined by ``"\\n"``, or an
        empty string when the vector store returns no matches.

    Raises:
        ValueError: If ``dense_weight`` is outside ``[0, 1]``.
    """
    if not 0.0 <= dense_weight <= 1.0:
        raise ValueError("dense_weight must be between 0 and 1")

    vector_db = get_vector_db()

    # Dense search (Qdrant)
    dense_results = vector_db.similarity_search_with_score(query, k=k)
    if not dense_results:
        # BM25Okapi divides by the corpus size and MinMaxScaler rejects
        # zero-sample input, so an empty hit list must short-circuit here.
        return ""

    documents = [doc.page_content for doc, _ in dense_results]
    dense_scores = [score for _, score in dense_results]

    # Sparse search (BM25), restricted to the dense candidates
    bm25 = BM25Okapi([text.split() for text in documents])
    bm25_scores = bm25.get_scores(query.split())

    # NOTE(review): the blend assumes a higher dense score means a better
    # match (similarity rather than distance) -- confirm against the
    # collection's configured distance metric.
    combined_scores = (
        dense_weight * _min_max(dense_scores)
        + (1.0 - dense_weight) * _min_max(bm25_scores)
    )

    # Highest combined score first
    ranked = np.argsort(combined_scores)[::-1][:k]
    return "\n".join(documents[i] for i in ranked)


if __name__ == "__main__":
    # Example usage
    query = "What is AI?"
    results = hybrid_search(query)
    print(results)