File size: 2,890 Bytes
945eb11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a24d1f9
b4c3246
 
a24d1f9
945eb11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import streamlit as st
import pickle
import os
import json

from collections import defaultdict
from langchain.vectorstores import FAISS
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from rank_bm25 import BM25Okapi

# Constants
# All retrieval artifacts live under one on-disk directory produced by the
# index-building step.
BASE_DIR = "built_index"
VECTOR_STORE_DIR = os.path.join(BASE_DIR, "vector_store")  # FAISS dense index directory
BM25_INDEX_FILE = os.path.join(BASE_DIR, "bm25_index.pkl")  # pickled (bm25, bm25_texts, url_order)
SEARCH_INDEX_FILE = os.path.join(BASE_DIR, "search_index.json")  # JSON map: URL -> page record

# Load embedding model
@st.cache_resource
def load_embeddings():
    """Return the sentence-embedding model used for dense retrieval.

    Cached by Streamlit so the model is instantiated once per process.
    """
    model_name = "all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_name)

# Load indexes
@st.cache_resource
def load_indexes():
    """Load every persisted retrieval artifact (cached across Streamlit reruns).

    Returns:
        Tuple of (index, vector_store, bm25, bm25_texts, url_order):
        `index` is a defaultdict(dict) mapping URL -> page record,
        `vector_store` is the FAISS dense index, and the last three
        back the sparse BM25 retriever.
    """
    # URL -> page-record lookup table.
    with open(SEARCH_INDEX_FILE, "r") as fh:
        search_index = defaultdict(dict, json.load(fh))

    # Dense FAISS store. The deserialization flag is required because the
    # store is pickled on disk — only load indexes built by this project.
    vector_store = FAISS.load_local(
        VECTOR_STORE_DIR,
        load_embeddings(),
        allow_dangerous_deserialization=True,
    )

    # Sparse BM25 scorer plus its tokenized corpus and URL ordering.
    with open(BM25_INDEX_FILE, "rb") as fh:
        bm25, bm25_texts, url_order = pickle.load(fh)

    return search_index, vector_store, bm25, bm25_texts, url_order

# Search functions
def semantic_search(vector_store, query, k=5):
    """Dense retrieval: return the top-k FAISS hits for `query`.

    Each hit is a dict with the source "url" (or "N/A" when the document
    carries no URL metadata) and a 200-character "snippet" of its content.
    """
    hits = []
    for doc in vector_store.similarity_search(query, k=k):
        hits.append({
            "url": doc.metadata.get("url", "N/A"),
            "snippet": doc.page_content[:200],
        })
    return hits

def bm25_search(bm25, bm25_texts, url_order, index, query, k=5):
    """Sparse (BM25) retrieval over the indexed corpus.

    Args:
        bm25: fitted BM25Okapi scorer.
        bm25_texts: tokenized corpus; unused here but kept so the signature
            matches what `load_indexes` returns (interface compatibility).
        url_order: list mapping corpus position -> document URL.
        index: mapping of URL -> page record (expected to hold a "content" key).
        query: raw query string; tokenized by lowercased whitespace split.
        k: number of results to return.

    Returns:
        List of {"url", "score", "snippet"} dicts, highest score first.
    """
    query_tokens = query.lower().split()
    scores = bm25.get_scores(query_tokens)
    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    results = []
    for i in top_indices:
        url = url_order[i]
        results.append({
            "url": url,
            "score": scores[i],
            # .get guards against records missing a "content" field: `index`
            # is loaded as defaultdict(dict), so index[url] can be {} and the
            # original index[url]["content"] would raise KeyError.
            "snippet": index[url].get("content", "")[:200],
        })
    return results

# Streamlit UI
def main():
    """Render the search page and display dense + sparse results for a query."""
    st.set_page_config(page_title="LangChain Search Engine")
    st.title("LangChain Search Engine ๐Ÿ”")
    st.markdown("Using Dense Search and Sparse Search. Indexed on April 02, 2025")
    st.markdown("for more details visit https://github.com/balnarendrasapa/search-engine")

    query = st.text_input("Enter your search query:", "")

    # Guard clause: nothing to do until the user types a query.
    if not query:
        return

    index, vector_store, bm25, bm25_texts, url_order = load_indexes()

    with st.spinner("Searching..."):
        dense_hits = semantic_search(vector_store, query)
        sparse_hits = bm25_search(bm25, bm25_texts, url_order, index, query)

    st.subheader("๐Ÿ”Ž Semantic Search Results")
    for rank, hit in enumerate(dense_hits, 1):
        st.markdown(f"**{rank}. [{hit['url']}]({hit['url']})**")
        st.write(hit['snippet'] + "...")

    st.subheader("๐Ÿงฎ BM25 Sparse Search Results")
    for rank, hit in enumerate(sparse_hits, 1):
        st.markdown(f"**{rank}. [{hit['url']}]({hit['url']})** (Score: {hit['score']:.2f})")
        st.write(hit['snippet'] + "...")

if __name__ == "__main__":
    main()