File size: 2,975 Bytes
5e433de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# This module is responsible for converting text data into embeddings using the 
# OpenAI API and storing in Faiss database.

import faiss
import tiktoken
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from typing import List, Tuple
from uuid import uuid4
from dotenv import load_dotenv
import logging
# other imports
from dataloader import dataloader

logging.basicConfig(level=logging.INFO)

def main(folder_path: str) -> None:
    """
    Convert text data into embeddings and store them in a Faiss database.

    Uses the OpenAI API (text-embedding-3-large) to generate embeddings and
    the Faiss library to manage the index. Before embedding, estimates the
    token count and API cost and asks the user for confirmation.

    Args:
        folder_path (str): Path to the folder containing the data files.
    """
    logging.info("Loading environment variables...")
    load_dotenv()  # Load environment variables (e.g. OPENAI_API_KEY) from .env
    logging.info("Environment variables loaded.")

    logging.info("Loading OpenAI embeddings...")
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    logging.info("OpenAI embeddings loaded.")

    logging.info("Creating Faiss index...")
    # Probe the model once to discover the embedding dimensionality.
    # NOTE(review): this costs one API call; the dimension could be
    # hard-coded (3072 for text-embedding-3-large) to avoid it — confirm.
    index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))
    # Load the encoder used to count tokens for the cost estimate.
    enc = tiktoken.get_encoding("cl100k_base")

    vector_store = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
        )
    logging.info("Faiss index created.")

    logging.info("Loading data from folder...")
    # Load the data; only the chunk list is needed here.
    chunks_list, _, _, _ = dataloader(folder_path)
    logging.info("Loaded %d chunks from folder: %s", len(chunks_list), folder_path)
    if not chunks_list:
        # Nothing to embed: skip the confirmation prompt and avoid
        # saving an empty index.
        logging.warning("No chunks found in folder: %s — nothing to embed.", folder_path)
        return

    # Calculate the number of tokens and estimate the API cost.
    total_tokens = sum(len(enc.encode(doc.page_content)) for doc in chunks_list)
    # Pricing for text-embedding-3-large: $0.13 per 1M tokens.
    cost = (total_tokens / 1_000_000) * 0.13
    logging.info("Total tokens: %d", total_tokens)
    logging.info("Estimated cost of using text-embedding-3-large: $%.2f", cost)

    # Ask user for confirmation before spending money on API calls.
    proceed = input("Do you want to proceed with embedding and storing the data in Faiss? (yes/no): ").strip().lower()
    if proceed not in ('yes', 'y'):
        logging.info("Operation cancelled by the user.")
        return
    logging.info("Proceeding with embedding and storing the data in Faiss...")

    logging.info("Converting text data to embeddings...")
    # Give each chunk a unique id so the docstore can map index slots back
    # to documents.
    uuids = [str(uuid4()) for _ in range(len(chunks_list))]
    vector_store.add_documents(documents=chunks_list, ids=uuids)
    logging.info("Text data converted to embeddings and stored in Faiss index.")
    vector_store.save_local("faiss_index")
    logging.info("Faiss index saved to local storage.")


if __name__ == "__main__":
    # Default dataset location when this module is run as a script.
    main(folder_path="dataset/converted_json_docs")