Spaces:

thenativefox
/

RAG

Sleeping

RAG

File size: 2,431 Bytes

b7f4e8c
 
 
 
3f0e240
c44b083
a7432c6
 
 
 
 
c44b083
 
 
 
bd075c2
1c860fb
bd075c2
a7432c6
bd075c2
 
 
 
 
 
 
1c860fb
3f0e240
b7f4e8c
1c860fb
 
 
 
 
 
 
 
 
 
 
c1d292c
 
 
9751345
b7f4e8c
 
 
 
 
 
9751345
 
 
 
 
 
 
 
 
 
b7f4e8c
 
9751345
 
b7f4e8c
 
 
 
 
 
 
 
c44b083
 
bd075c2

import lancedb
import os
import gradio as gr
from sentence_transformers import SentenceTransformer
from pathlib import Path
from dotenv import load_dotenv
import logging

# Configure logging for the process and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load environment variables from the .env file before any of them are read.
load_dotenv()

# The LanceDB store is expected in ".lancedb" under the current working directory.
current_working_dir = Path.cwd()
db_path = current_working_dir / ".lancedb"
logger.info(f"Database path: {db_path}")

# Diagnostic only: report what the store directory holds, or that it is missing.
if not db_path.exists():
    logger.error(f"LanceDB directory does not exist at path: {db_path}")
else:
    lancedb_contents = os.listdir(db_path)
    logger.info(f"Contents of the LanceDB directory: {lancedb_contents}")

# Connect regardless of the check above; the check is for logging only.
db = lancedb.connect(db_path)

# Record every table the connection reports.
table_names = db.table_names()
logger.info(f"Available LanceDB Tables: {table_names}")

# Diagnostic only: inspect the on-disk folder of the "model1_fixed" table.
model1_fixed_path = db_path / "model1_fixed.lance"
if not model1_fixed_path.exists():
    logger.error(f"model1_fixed.lance directory does not exist at path: {model1_fixed_path}")
else:
    model1_fixed_contents = os.listdir(model1_fixed_path)
    logger.info(f"Contents of the model1_fixed.lance folder: {model1_fixed_contents}")

# LanceDB table names; get_table_name() picks one based on EMB_MODEL.
MODEL1_STRATEGY1 = "model1_fixed"
MODEL2_STRATEGY1 = "model2_fixed"
MODEL3_STRATEGY1 = "model3_fixed"

# Column names and batch size, overridable through the environment.
VECTOR_COLUMN = os.environ.get("VECTOR_COLUMN", "vector")
TEXT_COLUMN = os.environ.get("TEXT_COLUMN", "text")
# NOTE(review): BATCH_SIZE is not referenced in the visible part of this file.
BATCH_SIZE = int(os.environ.get("BATCH_SIZE", 32))

# Embedding model used to encode queries; its name comes from EMB_MODEL.
retriever = SentenceTransformer(os.environ.get("EMB_MODEL"))

def get_table_name():
    """Map the ``EMB_MODEL`` environment variable to its LanceDB table name.

    Returns:
        The table-name constant that corresponds to the configured model.

    Raises:
        ValueError: If ``EMB_MODEL`` is unset or names a model with no table.
    """
    model_id = os.environ.get("EMB_MODEL")

    # One guard per supported model; anything else falls through to the error.
    if model_id == "sentence-transformers/all-MiniLM-L6-v2":
        return MODEL1_STRATEGY1
    if model_id == "BAAI/bge-large-en-v1.5":
        return MODEL2_STRATEGY1
    if model_id == "openai/text-embedding-ada-002":
        return MODEL3_STRATEGY1

    raise ValueError(f"Unsupported embedding model: {model_id}")

def retrieve(query, k):
    """Return the text of up to ``k`` vector-search hits for ``query``.

    The query is encoded with the module-level ``retriever`` and searched
    against the LanceDB table selected by ``get_table_name()``.

    Args:
        query: Text to encode and search for.
        k: Maximum number of results requested from the table.

    Returns:
        A list holding the ``TEXT_COLUMN`` value of each hit, in the order
        the search returned them.

    Raises:
        ValueError: If ``EMB_MODEL`` is unsupported (from ``get_table_name``).
        gr.Error: If the search or the column extraction fails. The message
            is the original error text and the original exception is
            attached as ``__cause__``.
    """
    table = db.open_table(get_table_name())
    query_vec = retriever.encode(query)
    try:
        hits = table.search(query_vec, vector_column_name=VECTOR_COLUMN).limit(k).to_list()
        return [hit[TEXT_COLUMN] for hit in hits]
    except Exception as e:
        # Keep the full traceback in the server log; the UI only gets the message.
        logger.exception("Vector search failed")
        # Chain explicitly so the root cause is preserved on the raised error.
        raise gr.Error(str(e)) from e
    
if __name__ == "__main__":
    # Manual smoke test: request four results for a sample question and print them.
    print(retrieve("What is transformer?", 4))