Spaces:

Oxbridge-Economics
/

Data-Collection-China

Sleeping

App Files Community

gavinzli committed on 24 days ago

Commit

1a8947e

1 Parent(s): b9d91b4

Refactor vectorizer module: clean up commented code and improve initialization logging

Browse files

Files changed (2) hide show

controllers/vectorizer.py +39 -58
test.py +80 -0

controllers/vectorizer.py CHANGED Viewed

@@ -6,13 +6,13 @@ import time
 import tiktoken
 import pandas as pd
 from langchain_astradb import AstraDBVectorStore
-from langchain_openai import AzureOpenAIEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import DataFrameLoader
-from astrapy import DataAPIClient
-from astrapy.info import CollectionVectorServiceOptions
-from astrapy.exceptions import CollectionAlreadyExistsException
-from astrapy.core.api import APIRequestError
 logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s',
@@ -20,62 +20,43 @@ logging.basicConfig(
     level=logging.ERROR)
 # from astrapy import AstraClient
-ASTRA_DB_APPLICATION_TOKEN = os.environ['ASTRA_DB_APPLICATION_TOKEN']
-ASTRA_DB_API_ENDPOINT = os.environ['ASTRA_DB_API_ENDPOINT']
-COLLECTION_NAME = "article"
-VECTOR_OPTIONS = CollectionVectorServiceOptions(
-    provider="azureOpenAI",
-    model_name="text-embedding-3-small",
-    authentication={"providerKey": "AZURE_OPENAI_API_KEY"},
-    parameters={
-        "resourceName": "openai-oe",
-        "deploymentId": "text-embedding-3-small",
-    },
-)
-client = DataAPIClient(token=ASTRA_DB_APPLICATION_TOKEN)
-database = client.get_database(ASTRA_DB_API_ENDPOINT)
-embedding = AzureOpenAIEmbeddings(
-    api_version="2024-07-01-preview",
-    azure_endpoint="https://openai-oe.openai.azure.com/")
-try:
-    # Try to create the collection
-    database.create_collection(
-        COLLECTION_NAME,
-        dimension=1536,  # Default dimension for text-embedding-3-small
-        metric="cosine",
-        service=VECTOR_OPTIONS
-    )
-    logging.info("Created new collection '%s'", COLLECTION_NAME)
-except (CollectionAlreadyExistsException, APIRequestError) as e:
-    logging.info("Collection '%s' already exists. Error Message: %s", COLLECTION_NAME, e)
-    collection = database.get_collection(COLLECTION_NAME)
 vstore = AstraDBVectorStore(
-    collection_name=COLLECTION_NAME,
     namespace="default_keyspace",
-    embedding=embedding,
-    token=ASTRA_DB_APPLICATION_TOKEN,
-    api_endpoint=ASTRA_DB_API_ENDPOINT)
-# vstore = AstraDBVectorStore(
-#     collection_vector_service_options=CollectionVectorServiceOptions(
-#         provider="azureOpenAI",
-#         model_name="text-embedding-3-small",
-#         authentication={
-#             "providerKey": "AZURE_OPENAI_API_KEY",
-#         },
-#         parameters={
-#             "resourceName": "openai-oe",
-#             "deploymentId": "text-embedding-3-small",
-#         },
-#     ),
-#     namespace="default_keyspace",
-#     collection_name="article",
-#     token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
-#     api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"])
 def token_length(text):
     """

 import tiktoken
 import pandas as pd
 from langchain_astradb import AstraDBVectorStore
+# from langchain_openai import AzureOpenAIEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import DataFrameLoader
+# from astrapy import DataAPIClient
+# from astrapy.info import CollectionVectorServiceOptions
+# from astrapy.exceptions import CollectionAlreadyExistsException
+# from astrapy.core.api import APIRequestError
 logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s',
     level=logging.ERROR)
 # from astrapy import AstraClient
+# ASTRA_DB_APPLICATION_TOKEN = os.environ['ASTRA_DB_APPLICATION_TOKEN']
+# ASTRA_DB_API_ENDPOINT = os.environ['ASTRA_DB_API_ENDPOINT']
+# COLLECTION_NAME = "article"
+# VECTOR_OPTIONS = CollectionVectorServiceOptions(
+#     provider="azureOpenAI",
+#     model_name="text-embedding-3-small",
+#     authentication={"providerKey": "AZURE_OPENAI_API_KEY"},
+#     parameters={
+#         "resourceName": "openai-oe",
+#         "deploymentId": "text-embedding-3-small",
+#     },
+# )
+# client = DataAPIClient(token=ASTRA_DB_APPLICATION_TOKEN)
+# database = client.get_database(ASTRA_DB_API_ENDPOINT)
+# embedding = AzureOpenAIEmbeddings(
+#     api_version="2024-07-01-preview",
+#     azure_endpoint="https://openai-oe.openai.azure.com/")
 vstore = AstraDBVectorStore(
+    # collection_vector_service_options=CollectionVectorServiceOptions(
+    #     provider="azureOpenAI",
+    #     model_name="text-embedding-3-small",
+    #     authentication={
+    #         "providerKey": "AZURE_OPENAI_API_KEY",
+    #     },
+    #     parameters={
+    #         "resourceName": "openai-oe",
+    #         "deploymentId": "text-embedding-3-small",
+    #     },
+    # ),
     namespace="default_keyspace",
+    collection_name="article",
+    token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
+    api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
+    autodetect_collection=True)
 def token_length(text):
     """

test.py ADDED Viewed

	@@ -0,0 +1,80 @@

+"""Module to upsert data into AstraDB"""
+import os
+import logging
+from langchain_astradb import AstraDBVectorStore
+from langchain_openai import AzureOpenAIEmbeddings
+from astrapy import DataAPIClient
+from astrapy.info import VectorServiceOptions
+# from astrapy.exceptions import CollectionAlreadyExistsException
+# from astrapy.core.api import APIRequestError
+logging.basicConfig(
+    format='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s',
+    datefmt="%Y-%m-%d %H:%M:%S",
+    level=logging.INFO)
+logging.info("Initializing AstraDB client...")
+# from astrapy import AstraClient
+ASTRA_DB_APPLICATION_TOKEN = os.environ['ASTRA_DB_APPLICATION_TOKEN']
+ASTRA_DB_API_ENDPOINT = os.environ['ASTRA_DB_API_ENDPOINT']
+COLLECTION_NAME = "article"
+# VECTOR_OPTIONS = VectorServiceOptions(
+#     provider="azureOpenAI",
+#     model_name="text-embedding-3-small",
+#     authentication={"providerKey": "AZURE_OPENAI_API_KEY"},
+#     parameters={
+#         "resourceName": "openai-oe",
+#         "deploymentId": "text-embedding-3-small",
+#     },
+# )
+# client = DataAPIClient(token=ASTRA_DB_APPLICATION_TOKEN)
+# database = client.get_database(ASTRA_DB_API_ENDPOINT)
+# embedding = AzureOpenAIEmbeddings(
+#     api_version="2024-07-01-preview",
+#     azure_endpoint="https://openai-oe.openai.azure.com/")
+# try:
+#     # Try to create the collection
+#     database.create_collection(
+#         name = COLLECTION_NAME,
+#         dimension=1536,  # Default dimension for text-embedding-3-small
+#         metric="cosine",
+#         service=VECTOR_OPTIONS
+#     )
+#     logging.info("Created new collection '%s'", COLLECTION_NAME)
+# except (CollectionAlreadyExistsException, APIRequestError) as e:
+#     logging.info("Collection '%s' already exists. Error Message: %s", COLLECTION_NAME, e)
+#     collection = database.get_collection(COLLECTION_NAME)
+# vstore = AstraDBVectorStore(
+#     collection_name=COLLECTION_NAME,
+#     namespace="default_keyspace",
+#     embedding=embedding,
+#     token=ASTRA_DB_APPLICATION_TOKEN,
+#     api_endpoint=ASTRA_DB_API_ENDPOINT)
+vstore = AstraDBVectorStore(
+    # collection_vector_service_options=VectorServiceOptions(
+    #     provider="azureOpenAI",
+    #     model_name="text-embedding-3-small",
+    #     authentication={
+    #         "providerKey": "AZURE_OPENAI_API_KEY",
+    #     },
+    #     parameters={
+    #         "resourceName": "openai-oe",
+    #         "deploymentId": "text-embedding-3-small",
+    #     },
+    # ),
+    namespace="default_keyspace",
+    collection_name="article",
+    token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
+    api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
+    autodetect_collection=True)
+results = vstore.similarity_search(query="thud",k=1)
+for doc in results:
+    print(f"* {doc.page_content} [{doc.metadata}]")