We used this notebook to migrate vectors from our FAISS indices to Pinecone. I recommend you use our scripts to ingest your data directly into Pinecone. To run this migration, point it at a folder containing the index.faiss and index.pkl files that you want to ingest into Pinecone.

In [None]:
import getpass
import os
import time
from pinecone import Pinecone, ServerlessSpec

# Read the Pinecone API key from the environment (None when the variable is unset).
pinecone_api_key = os.getenv("PINECONE_API_KEY")

# Pinecone client built from that key.
pc = Pinecone(
    api_key=pinecone_api_key,
)

  from tqdm.autonotebook import tqdm


In [2]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model wrapper for the sentence-transformers MiniLM checkpoint.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [None]:
import os
from langchain_community.vectorstores import FAISS
from pinecone import Pinecone, ServerlessSpec
from langchain_community.embeddings import OpenAIEmbeddings
from tqdm import tqdm
from langchain_pinecone import PineconeVectorStore

def migrate_faiss_to_pinecone(
    faiss_index_path: str,
    pinecone_api_key: str,
    index_name: str,
    batch_size: int = 100
):
    """
    Migrate a local FAISS index to an existing Pinecone index.

    Only documents whose ``metadata['field']`` is one of the abstract/title
    fields and whose text is longer than 3 characters are migrated. The
    vectors already stored in FAISS are uploaded as-is (nothing is
    re-embedded), each under its FAISS docstore id, with the document text
    stored under the ``"text"`` metadata key next to the original metadata.

    Args:
        faiss_index_path: Folder containing ``index.faiss`` and ``index.pkl``.
        pinecone_api_key: Your Pinecone API key.
        index_name: Name of the Pinecone index to upload into. The index is
            only opened here, never created.
        batch_size: Number of vectors to upload in each upsert request.

    Returns:
        A ``PineconeVectorStore`` bound to the populated index.

    Raises:
        ValueError: If ``batch_size`` is not positive.
        KeyError: If a stored document has no ``'field'`` metadata entry.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Metadata key that carries the raw text. It is passed to the vector store
    # explicitly below so that uploads and later reads agree on it.
    text_key = "text"
    allowed_fields = {
        'abstract_tsi',
        'title_info_primary_tsi',
        'title_info_primary_subtitle_tsi',
        'title_info_alternative_tsim',
    }

    # Load the local FAISS index.
    # SECURITY: allow_dangerous_deserialization=True unpickles index.pkl, which
    # can execute arbitrary code. Only load indices you created or fully trust.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    faiss_vectorstore = FAISS.load_local(
        faiss_index_path, embeddings, allow_dangerous_deserialization=True
    )

    pc = Pinecone(api_key=pinecone_api_key)
    index = pc.Index(index_name)

    # Select the documents to migrate, remembering each one's row in the FAISS
    # index. Filtering leaves gaps, so the n-th kept document is generally NOT
    # the n-th FAISS vector; the row must come from index_to_docstore_id.
    stored_docs = faiss_vectorstore.docstore._dict
    selected = []
    for position, doc_id in faiss_vectorstore.index_to_docstore_id.items():
        doc = stored_docs[doc_id]
        if doc.metadata['field'] in allowed_fields and len(doc.page_content) > 3:
            selected.append((position, doc_id, doc))

    total_docs = len(selected)

    # Batch processing: upsert the stored vectors directly. Going through
    # PineconeVectorStore.add_texts would recompute every embedding from the
    # text instead of reusing the vectors held in FAISS.
    for start in tqdm(range(0, total_docs, batch_size)):
        vectors = [
            (
                doc_id,
                faiss_vectorstore.index.reconstruct(int(position)).tolist(),
                # Copy the metadata so the FAISS docstore is left untouched.
                {**doc.metadata, text_key: doc.page_content},
            )
            for position, doc_id, doc in selected[start:start + batch_size]
        ]
        index.upsert(vectors=vectors)

    pinecone_vectorstore = PineconeVectorStore(
        index=index, embedding=embeddings, text_key=text_key
    )

    print(f"Successfully migrated {total_docs} documents to Pinecone index '{index_name}'")
    return pinecone_vectorstore

# Example usage:
if __name__ == "__main__":
    # Set your paths. The API key is read from the environment so that no
    # credential lives in the source; any key that was previously committed
    # here should be treated as compromised and rotated.
    FAISS_INDEX_PATH = "faiss_900_1200"
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    INDEX_NAME = "bpl-rag"

    if not PINECONE_API_KEY:
        raise RuntimeError(
            "Set the PINECONE_API_KEY environment variable before running the migration."
        )

    # Perform migration
    pinecone_vs = migrate_faiss_to_pinecone(
        faiss_index_path=FAISS_INDEX_PATH,
        pinecone_api_key=PINECONE_API_KEY,
        index_name=INDEX_NAME,
        batch_size=100
    )

100%|██████████| 4685/4685 [1:57:28<00:00,  1.50s/it]  


Successfully migrated 468455 documents to Pinecone index 'bpl-rag'
