# This module is responsible for converting text data into embeddings using the # OpenAI API and storing in Faiss database. import faiss import tiktoken from langchain_community.docstore.in_memory import InMemoryDocstore from langchain_community.vectorstores import FAISS from langchain_openai import OpenAIEmbeddings from typing import List, Tuple from uuid import uuid4 from dotenv import load_dotenv import logging # other imports from dataloader import dataloader logging.basicConfig(level=logging.INFO) def main(folder_path: str)-> None: """ Main function to convert text data into embeddings and store them in a Faiss database. The function uses the OpenAI API to generate embeddings and the Faiss library to manage the index. Args: folder_path (str): path to the folder containing the data files. """ logging.info("Loading environment variables...") load_dotenv() # Load environment variables from .env file logging.info("Environment variables loaded.") logging.info("Loading OpenAI embeddings...") embeddings = OpenAIEmbeddings(model="text-embedding-3-large") logging.info("OpenAI embeddings loaded.") logging.info("Creating Faiss index...") # Create a Faiss inde index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world"))) # load the emcoder to calculate the number of tokens enc = tiktoken.get_encoding("cl100k_base") vector_store = FAISS( embedding_function=embeddings, index=index, docstore=InMemoryDocstore(), index_to_docstore_id={}, ) logging.info("Faiss index created.") logging.info("Loading data from folder...") # Load the data chunks_list, _, _, _ = dataloader(folder_path) logging.info(f"Loaded {len(chunks_list)} chunks from folder: {folder_path}") # calculte the number of tokens total_tokens = sum(len(enc.encode(doc.page_content)) for doc in chunks_list) cost = (total_tokens / 1000000) * 0.13 logging.info(f"Total tokens: {total_tokens}") logging.info(f"Estimated cost of using text-embedding-3-large: ${cost:.2f}") # Ask user for confirmation proceed = input("Do you want to proceed with embedding and storing the data in Faiss? (yes/no): ").strip().lower() if proceed not in ['yes', 'y']: logging.info("Operation cancelled by the user.") return logging.info("Proceeding with embedding and storing the data in Faiss...") logging.info("Converting text data to embeddings...") # Convert text data to embeddings uuids = [str(uuid4()) for _ in range(len(chunks_list))] vector_store.add_documents(documents=chunks_list, ids=uuids) logging.info("Text data converted to embeddings and stored in Faiss index.") vector_store.save_local("faiss_index") logging.info("Faiss index saved to local storage.") if __name__ == "__main__": folder_path = "dataset/converted_json_docs" main(folder_path)