gavinzli committed on
Commit
4f365e0
·
1 Parent(s): 1ac92c2

Refactor vectorizer to create collection in AstraDB and improve logging

Browse files
Files changed (1) hide show
  1. controllers/vectorizer.py +50 -14
controllers/vectorizer.py CHANGED
@@ -9,36 +9,72 @@ from langchain_astradb import AstraDBVectorStore
9
  from langchain_openai import AzureOpenAIEmbeddings
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
11
  from langchain_community.document_loaders import DataFrameLoader
 
12
  from astrapy.info import CollectionVectorServiceOptions
 
13
 
14
  logging.basicConfig(
15
  format='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s',
16
  datefmt="%Y-%m-%d %H:%M:%S",
17
  level=logging.ERROR)
18
 
 
19
  ASTRA_DB_APPLICATION_TOKEN = os.environ['ASTRA_DB_APPLICATION_TOKEN']
20
  ASTRA_DB_API_ENDPOINT = os.environ['ASTRA_DB_API_ENDPOINT']
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  embedding = AzureOpenAIEmbeddings(
23
  api_version="2024-07-01-preview",
24
  azure_endpoint="https://openai-oe.openai.azure.com/")
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  vstore = AstraDBVectorStore(
27
- collection_vector_service_options=CollectionVectorServiceOptions(
28
- provider="azureOpenAI",
29
- model_name="text-embedding-3-small",
30
- authentication={
31
- "providerKey": "AZURE_OPENAI_API_KEY",
32
- },
33
- parameters={
34
- "resourceName": "openai-oe",
35
- "deploymentId": "text-embedding-3-small",
36
- },
37
- ),
38
  namespace="default_keyspace",
39
- collection_name="article",
40
- token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
41
- api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  def token_length(text):
44
  """
 
9
  from langchain_openai import AzureOpenAIEmbeddings
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
11
  from langchain_community.document_loaders import DataFrameLoader
12
+ from astrapy import DataAPIClient
13
  from astrapy.info import CollectionVectorServiceOptions
14
+ from astrapy.exceptions import CollectionAlreadyExistsException
15
 
16
  logging.basicConfig(
17
  format='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s',
18
  datefmt="%Y-%m-%d %H:%M:%S",
19
  level=logging.ERROR)
20
 
21
+ # from astrapy import AstraClient
22
  ASTRA_DB_APPLICATION_TOKEN = os.environ['ASTRA_DB_APPLICATION_TOKEN']
23
  ASTRA_DB_API_ENDPOINT = os.environ['ASTRA_DB_API_ENDPOINT']
24
+ COLLECTION_NAME = "article"
25
+ VECTOR_OPTIONS = CollectionVectorServiceOptions(
26
+ provider="azureOpenAI",
27
+ model_name="text-embedding-3-small",
28
+ authentication={"providerKey": "AZURE_OPENAI_API_KEY"},
29
+ parameters={
30
+ "resourceName": "openai-oe",
31
+ "deploymentId": "text-embedding-3-small",
32
+ },
33
+ )
34
+
35
+ client = DataAPIClient(token=ASTRA_DB_APPLICATION_TOKEN)
36
+ database = client.get_database(ASTRA_DB_API_ENDPOINT)
37
 
38
  embedding = AzureOpenAIEmbeddings(
39
  api_version="2024-07-01-preview",
40
  azure_endpoint="https://openai-oe.openai.azure.com/")
41
 
42
+ try:
43
+ # Try to create the collection
44
+ database.create_collection(
45
+ COLLECTION_NAME,
46
+ dimension=1536, # Default dimension for text-embedding-3-small
47
+ metric="cosine",
48
+ service=VECTOR_OPTIONS
49
+ )
50
+ logging.info("Created new collection '%s'", COLLECTION_NAME)
51
+ except CollectionAlreadyExistsException:
52
+ logging.info("Collection '%s' already exists. Verifying settings...", COLLECTION_NAME)
53
+ collection = database.get_collection(COLLECTION_NAME)
54
+
55
  vstore = AstraDBVectorStore(
56
+ collection_name=COLLECTION_NAME,
 
 
 
 
 
 
 
 
 
 
57
  namespace="default_keyspace",
58
+ embedding=embedding,
59
+ token=ASTRA_DB_APPLICATION_TOKEN,
60
+ api_endpoint=ASTRA_DB_API_ENDPOINT)
61
+
62
+ # vstore = AstraDBVectorStore(
63
+ # collection_vector_service_options=CollectionVectorServiceOptions(
64
+ # provider="azureOpenAI",
65
+ # model_name="text-embedding-3-small",
66
+ # authentication={
67
+ # "providerKey": "AZURE_OPENAI_API_KEY",
68
+ # },
69
+ # parameters={
70
+ # "resourceName": "openai-oe",
71
+ # "deploymentId": "text-embedding-3-small",
72
+ # },
73
+ # ),
74
+ # namespace="default_keyspace",
75
+ # collection_name="article",
76
+ # token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
77
+ # api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"])
78
 
79
  def token_length(text):
80
  """