gavinzli committed on
Commit
1a8947e
·
1 Parent(s): b9d91b4

Refactor vectorizer module: clean up commented code and improve initialization logging

Browse files
Files changed (2) hide show
  1. controllers/vectorizer.py +39 -58
  2. test.py +80 -0
controllers/vectorizer.py CHANGED
@@ -6,13 +6,13 @@ import time
6
  import tiktoken
7
  import pandas as pd
8
  from langchain_astradb import AstraDBVectorStore
9
- from langchain_openai import AzureOpenAIEmbeddings
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
11
  from langchain_community.document_loaders import DataFrameLoader
12
- from astrapy import DataAPIClient
13
- from astrapy.info import CollectionVectorServiceOptions
14
- from astrapy.exceptions import CollectionAlreadyExistsException
15
- from astrapy.core.api import APIRequestError
16
 
17
  logging.basicConfig(
18
  format='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s',
@@ -20,62 +20,43 @@ logging.basicConfig(
20
  level=logging.ERROR)
21
 
22
  # from astrapy import AstraClient
23
- ASTRA_DB_APPLICATION_TOKEN = os.environ['ASTRA_DB_APPLICATION_TOKEN']
24
- ASTRA_DB_API_ENDPOINT = os.environ['ASTRA_DB_API_ENDPOINT']
25
- COLLECTION_NAME = "article"
26
- VECTOR_OPTIONS = CollectionVectorServiceOptions(
27
- provider="azureOpenAI",
28
- model_name="text-embedding-3-small",
29
- authentication={"providerKey": "AZURE_OPENAI_API_KEY"},
30
- parameters={
31
- "resourceName": "openai-oe",
32
- "deploymentId": "text-embedding-3-small",
33
- },
34
- )
35
-
36
- client = DataAPIClient(token=ASTRA_DB_APPLICATION_TOKEN)
37
- database = client.get_database(ASTRA_DB_API_ENDPOINT)
38
-
39
- embedding = AzureOpenAIEmbeddings(
40
- api_version="2024-07-01-preview",
41
- azure_endpoint="https://openai-oe.openai.azure.com/")
42
-
43
- try:
44
- # Try to create the collection
45
- database.create_collection(
46
- COLLECTION_NAME,
47
- dimension=1536, # Default dimension for text-embedding-3-small
48
- metric="cosine",
49
- service=VECTOR_OPTIONS
50
- )
51
- logging.info("Created new collection '%s'", COLLECTION_NAME)
52
- except (CollectionAlreadyExistsException, APIRequestError) as e:
53
- logging.info("Collection '%s' already exists. Error Message: %s", COLLECTION_NAME, e)
54
- collection = database.get_collection(COLLECTION_NAME)
55
 
56
  vstore = AstraDBVectorStore(
57
- collection_name=COLLECTION_NAME,
 
 
 
 
 
 
 
 
 
 
58
  namespace="default_keyspace",
59
- embedding=embedding,
60
- token=ASTRA_DB_APPLICATION_TOKEN,
61
- api_endpoint=ASTRA_DB_API_ENDPOINT)
62
-
63
- # vstore = AstraDBVectorStore(
64
- # collection_vector_service_options=CollectionVectorServiceOptions(
65
- # provider="azureOpenAI",
66
- # model_name="text-embedding-3-small",
67
- # authentication={
68
- # "providerKey": "AZURE_OPENAI_API_KEY",
69
- # },
70
- # parameters={
71
- # "resourceName": "openai-oe",
72
- # "deploymentId": "text-embedding-3-small",
73
- # },
74
- # ),
75
- # namespace="default_keyspace",
76
- # collection_name="article",
77
- # token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
78
- # api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"])
79
 
80
  def token_length(text):
81
  """
 
6
  import tiktoken
7
  import pandas as pd
8
  from langchain_astradb import AstraDBVectorStore
9
+ # from langchain_openai import AzureOpenAIEmbeddings
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
11
  from langchain_community.document_loaders import DataFrameLoader
12
+ # from astrapy import DataAPIClient
13
+ # from astrapy.info import CollectionVectorServiceOptions
14
+ # from astrapy.exceptions import CollectionAlreadyExistsException
15
+ # from astrapy.core.api import APIRequestError
16
 
17
  logging.basicConfig(
18
  format='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s',
 
20
  level=logging.ERROR)
21
 
22
  # from astrapy import AstraClient
23
+ # ASTRA_DB_APPLICATION_TOKEN = os.environ['ASTRA_DB_APPLICATION_TOKEN']
24
+ # ASTRA_DB_API_ENDPOINT = os.environ['ASTRA_DB_API_ENDPOINT']
25
+ # COLLECTION_NAME = "article"
26
+ # VECTOR_OPTIONS = CollectionVectorServiceOptions(
27
+ # provider="azureOpenAI",
28
+ # model_name="text-embedding-3-small",
29
+ # authentication={"providerKey": "AZURE_OPENAI_API_KEY"},
30
+ # parameters={
31
+ # "resourceName": "openai-oe",
32
+ # "deploymentId": "text-embedding-3-small",
33
+ # },
34
+ # )
35
+
36
+ # client = DataAPIClient(token=ASTRA_DB_APPLICATION_TOKEN)
37
+ # database = client.get_database(ASTRA_DB_API_ENDPOINT)
38
+
39
+ # embedding = AzureOpenAIEmbeddings(
40
+ # api_version="2024-07-01-preview",
41
+ # azure_endpoint="https://openai-oe.openai.azure.com/")
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  vstore = AstraDBVectorStore(
44
+ # collection_vector_service_options=CollectionVectorServiceOptions(
45
+ # provider="azureOpenAI",
46
+ # model_name="text-embedding-3-small",
47
+ # authentication={
48
+ # "providerKey": "AZURE_OPENAI_API_KEY",
49
+ # },
50
+ # parameters={
51
+ # "resourceName": "openai-oe",
52
+ # "deploymentId": "text-embedding-3-small",
53
+ # },
54
+ # ),
55
  namespace="default_keyspace",
56
+ collection_name="article",
57
+ token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
58
+ api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
59
+ autodetect_collection=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  def token_length(text):
62
  """
test.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module to upsert data into AstraDB"""
2
+ import os
3
+ import logging
4
+
5
+ from langchain_astradb import AstraDBVectorStore
6
+ from langchain_openai import AzureOpenAIEmbeddings
7
+ from astrapy import DataAPIClient
8
+ from astrapy.info import VectorServiceOptions
9
+ # from astrapy.exceptions import CollectionAlreadyExistsException
10
+ # from astrapy.core.api import APIRequestError
11
+
12
+ logging.basicConfig(
13
+ format='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s',
14
+ datefmt="%Y-%m-%d %H:%M:%S",
15
+ level=logging.INFO)
16
+
17
+ logging.info("Initializing AstraDB client...")
18
+
19
+ # from astrapy import AstraClient
20
+ ASTRA_DB_APPLICATION_TOKEN = os.environ['ASTRA_DB_APPLICATION_TOKEN']
21
+ ASTRA_DB_API_ENDPOINT = os.environ['ASTRA_DB_API_ENDPOINT']
22
+ COLLECTION_NAME = "article"
23
+ # VECTOR_OPTIONS = VectorServiceOptions(
24
+ # provider="azureOpenAI",
25
+ # model_name="text-embedding-3-small",
26
+ # authentication={"providerKey": "AZURE_OPENAI_API_KEY"},
27
+ # parameters={
28
+ # "resourceName": "openai-oe",
29
+ # "deploymentId": "text-embedding-3-small",
30
+ # },
31
+ # )
32
+
33
+ # client = DataAPIClient(token=ASTRA_DB_APPLICATION_TOKEN)
34
+ # database = client.get_database(ASTRA_DB_API_ENDPOINT)
35
+
36
+ # embedding = AzureOpenAIEmbeddings(
37
+ # api_version="2024-07-01-preview",
38
+ # azure_endpoint="https://openai-oe.openai.azure.com/")
39
+
40
+ # try:
41
+ # # Try to create the collection
42
+ # database.create_collection(
43
+ # name = COLLECTION_NAME,
44
+ # dimension=1536, # Default dimension for text-embedding-3-small
45
+ # metric="cosine",
46
+ # service=VECTOR_OPTIONS
47
+ # )
48
+ # logging.info("Created new collection '%s'", COLLECTION_NAME)
49
+ # except (CollectionAlreadyExistsException, APIRequestError) as e:
50
+ # logging.info("Collection '%s' already exists. Error Message: %s", COLLECTION_NAME, e)
51
+ # collection = database.get_collection(COLLECTION_NAME)
52
+
53
+ # vstore = AstraDBVectorStore(
54
+ # collection_name=COLLECTION_NAME,
55
+ # namespace="default_keyspace",
56
+ # embedding=embedding,
57
+ # token=ASTRA_DB_APPLICATION_TOKEN,
58
+ # api_endpoint=ASTRA_DB_API_ENDPOINT)
59
+
60
+ vstore = AstraDBVectorStore(
61
+ # collection_vector_service_options=VectorServiceOptions(
62
+ # provider="azureOpenAI",
63
+ # model_name="text-embedding-3-small",
64
+ # authentication={
65
+ # "providerKey": "AZURE_OPENAI_API_KEY",
66
+ # },
67
+ # parameters={
68
+ # "resourceName": "openai-oe",
69
+ # "deploymentId": "text-embedding-3-small",
70
+ # },
71
+ # ),
72
+ namespace="default_keyspace",
73
+ collection_name="article",
74
+ token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
75
+ api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
76
+ autodetect_collection=True)
77
+
78
+ results = vstore.similarity_search(query="thud",k=1)
79
+ for doc in results:
80
+ print(f"* {doc.page_content} [{doc.metadata}]")