# NOTE(review): removed non-Python page residue ("Spaces:", "Running", file
# size, commit hash, line-number gutter) left over from a web copy/paste —
# it made the module unimportable.
# This module is responsible for converting text data into embeddings using the
# OpenAI API and storing in Faiss database.
import faiss
import tiktoken
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from typing import List, Tuple
from uuid import uuid4
from dotenv import load_dotenv
import logging
# other imports
from dataloader import dataloader
# Configure the root logger once at import time so the INFO-level progress
# messages emitted throughout this script are actually shown.
logging.basicConfig(level=logging.INFO)
def main(folder_path: str, index_path: str = "faiss_index") -> None:
    """
    Convert text data into embeddings and store them in a Faiss database.

    Uses the OpenAI API (model ``text-embedding-3-large``) to generate the
    embeddings and the Faiss library to manage the index. Before embedding,
    the token count and an estimated API cost are reported and the user is
    asked to confirm, since the embedding call costs money.

    Args:
        folder_path (str): Path to the folder containing the data files.
        index_path (str): Local directory to save the Faiss index to.
            Defaults to "faiss_index" (the original hard-coded location).
    """
    logging.info("Loading environment variables...")
    load_dotenv()  # Load OPENAI_API_KEY etc. from the .env file
    logging.info("Environment variables loaded.")
    logging.info("Loading OpenAI embeddings...")
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    logging.info("OpenAI embeddings loaded.")
    logging.info("Creating Faiss index...")
    # Create a Faiss index; probe the embedding dimension with a throwaway
    # query (this performs one API call).
    index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))
    # Load the encoder to calculate the number of tokens before embedding.
    enc = tiktoken.get_encoding("cl100k_base")
    vector_store = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )
    logging.info("Faiss index created.")
    logging.info("Loading data from folder...")
    # Load the data; dataloader returns (chunks, _, _, _) — only chunks used here.
    chunks_list, _, _, _ = dataloader(folder_path)
    logging.info("Loaded %d chunks from folder: %s", len(chunks_list), folder_path)
    # Calculate the number of tokens and the estimated cost
    # (text-embedding-3-large is priced at $0.13 per 1M tokens).
    total_tokens = sum(len(enc.encode(doc.page_content)) for doc in chunks_list)
    cost = (total_tokens / 1_000_000) * 0.13
    logging.info("Total tokens: %d", total_tokens)
    logging.info("Estimated cost of using text-embedding-3-large: $%.2f", cost)
    # Ask user for confirmation before spending money on the API.
    proceed = input("Do you want to proceed with embedding and storing the data in Faiss? (yes/no): ").strip().lower()
    if proceed not in ("yes", "y"):
        logging.info("Operation cancelled by the user.")
        return
    logging.info("Proceeding with embedding and storing the data in Faiss...")
    logging.info("Converting text data to embeddings...")
    # Convert text data to embeddings; one UUID per chunk as the docstore id.
    uuids = [str(uuid4()) for _ in range(len(chunks_list))]
    vector_store.add_documents(documents=chunks_list, ids=uuids)
    logging.info("Text data converted to embeddings and stored in Faiss index.")
    vector_store.save_local(index_path)
    logging.info("Faiss index saved to local storage.")
if __name__ == "__main__":
    # Script entry point: embed the converted JSON documents into a local
    # FAISS index stored on disk.
    data_folder = "dataset/converted_json_docs"
    main(data_folder)