# Hugging Face Spaces page chrome ("Spaces: Running") captured during
# extraction — not code; commented out so the file parses as Python.
import json
import os
import pickle
from collections import defaultdict

import streamlit as st
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from rank_bm25 import BM25Okapi
# Constants: on-disk layout of the prebuilt search artifacts.
BASE_DIR = "built_index"
VECTOR_STORE_DIR = os.path.join(BASE_DIR, "vector_store")        # FAISS files
BM25_INDEX_FILE = os.path.join(BASE_DIR, "bm25_index.pkl")       # pickled (bm25, texts, urls)
SEARCH_INDEX_FILE = os.path.join(BASE_DIR, "search_index.json")  # url -> page data
def load_embeddings():
    """Return the sentence-embedding model used for dense retrieval.

    Returns:
        A ``HuggingFaceEmbeddings`` wrapper around the
        ``all-MiniLM-L6-v2`` sentence-transformers model.
    """
    return HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
def load_indexes():
    """Load all prebuilt search artifacts from disk.

    Returns:
        Tuple of ``(index, vector_store, bm25, bm25_texts, url_order)``:
        the url->page-data mapping, the FAISS dense store, the BM25 model,
        its tokenized corpus, and the URL order aligned with BM25 scores.
    """
    # Document index: url -> page data (used for result snippets).
    with open(SEARCH_INDEX_FILE, "r") as f:
        index = defaultdict(dict, json.load(f))

    # Dense vector store.
    # NOTE(security): allow_dangerous_deserialization here and pickle.load
    # below can execute arbitrary code — only load index files you built
    # yourself, never untrusted downloads.
    embeddings = load_embeddings()
    vector_store = FAISS.load_local(
        VECTOR_STORE_DIR, embeddings, allow_dangerous_deserialization=True
    )

    # Sparse BM25 model plus the corpus bookkeeping it was built with.
    with open(BM25_INDEX_FILE, "rb") as f:
        bm25, bm25_texts, url_order = pickle.load(f)

    return index, vector_store, bm25, bm25_texts, url_order
# Search functions
def semantic_search(vector_store, query, k=5):
    """Run dense (embedding-similarity) search against the FAISS store.

    Args:
        vector_store: Object exposing ``similarity_search(query, k=...)``
            returning documents with ``metadata`` and ``page_content``.
        query: Free-text search query.
        k: Maximum number of results to return.

    Returns:
        List of dicts with ``url`` (``"N/A"`` when missing from metadata)
        and a ``snippet`` truncated to 200 characters.
    """
    results = vector_store.similarity_search(query, k=k)
    return [
        {
            "url": doc.metadata.get("url", "N/A"),
            "snippet": doc.page_content[:200],
        }
        for doc in results
    ]
def bm25_search(bm25, bm25_texts, url_order, index, query, k=5):
    """Run sparse (BM25 keyword) search over the indexed corpus.

    Args:
        bm25: BM25 model exposing ``get_scores(tokens)``.
        bm25_texts: Tokenized corpus; unused here but kept so the call
            signature matches the tuple returned by ``load_indexes``.
        url_order: URL for each corpus position, aligned with the scores.
        index: Mapping url -> {"content": ...} used for snippets.
        query: Free-text query; tokenized by lowercased whitespace split.
        k: Maximum number of results to return.

    Returns:
        List of dicts with ``url``, BM25 ``score``, and a 200-char
        ``snippet``, ordered by descending score.
    """
    tokens = query.lower().split()
    scores = bm25.get_scores(tokens)
    # Rank corpus positions by score, best first, and keep the top k.
    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    return [
        {
            "url": url_order[i],
            "score": scores[i],
            "snippet": index[url_order[i]]["content"][:200],
        }
        for i in ranked
    ]
# Streamlit UI
def main():
    """Streamlit entry point: render the query box and both result lists."""
    st.set_page_config(page_title="LangChain Search Engine")
    st.title("LangChain Search Engine ๐")
    st.markdown("Using Dense Search and Sparse Search. Indexed on April 02, 2025")
    st.markdown("for more details visit https://github.com/balnarendrasapa/search-engine")

    query = st.text_input("Enter your search query:", "")
    if not query:
        return

    # Load inside the spinner so the user sees feedback during the
    # (potentially slow) index/model load as well as the search itself.
    with st.spinner("Searching..."):
        index, vector_store, bm25, bm25_texts, url_order = load_indexes()
        sem_results = semantic_search(vector_store, query)
        bm25_results = bm25_search(bm25, bm25_texts, url_order, index, query)

    st.subheader("๐ Semantic Search Results")
    for i, res in enumerate(sem_results, 1):
        st.markdown(f"**{i}. [{res['url']}]({res['url']})**")
        st.write(res['snippet'] + "...")

    st.subheader("๐งฎ BM25 Sparse Search Results")
    for i, res in enumerate(bm25_results, 1):
        st.markdown(f"**{i}. [{res['url']}]({res['url']})** (Score: {res['score']:.2f})")
        st.write(res['snippet'] + "...")
# Script entry point.
if __name__ == "__main__":
    main()