Refactor vectorizer to create collection in AstraDB and improve logging
Browse files- controllers/vectorizer.py +50 -14
controllers/vectorizer.py
CHANGED
@@ -9,36 +9,72 @@ from langchain_astradb import AstraDBVectorStore
|
|
9 |
from langchain_openai import AzureOpenAIEmbeddings
|
10 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
11 |
from langchain_community.document_loaders import DataFrameLoader
|
|
|
12 |
from astrapy.info import CollectionVectorServiceOptions
|
|
|
13 |
|
14 |
logging.basicConfig(
|
15 |
format='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s',
|
16 |
datefmt="%Y-%m-%d %H:%M:%S",
|
17 |
level=logging.ERROR)
|
18 |
|
|
|
19 |
ASTRA_DB_APPLICATION_TOKEN = os.environ['ASTRA_DB_APPLICATION_TOKEN']
|
20 |
ASTRA_DB_API_ENDPOINT = os.environ['ASTRA_DB_API_ENDPOINT']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
embedding = AzureOpenAIEmbeddings(
|
23 |
api_version="2024-07-01-preview",
|
24 |
azure_endpoint="https://openai-oe.openai.azure.com/")
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
vstore = AstraDBVectorStore(
|
27 |
-
|
28 |
-
provider="azureOpenAI",
|
29 |
-
model_name="text-embedding-3-small",
|
30 |
-
authentication={
|
31 |
-
"providerKey": "AZURE_OPENAI_API_KEY",
|
32 |
-
},
|
33 |
-
parameters={
|
34 |
-
"resourceName": "openai-oe",
|
35 |
-
"deploymentId": "text-embedding-3-small",
|
36 |
-
},
|
37 |
-
),
|
38 |
namespace="default_keyspace",
|
39 |
-
|
40 |
-
token=
|
41 |
-
api_endpoint=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
def token_length(text):
|
44 |
"""
|
|
|
9 |
from langchain_openai import AzureOpenAIEmbeddings
|
10 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
11 |
from langchain_community.document_loaders import DataFrameLoader
|
12 |
+
from astrapy import DataAPIClient
|
13 |
from astrapy.info import CollectionVectorServiceOptions
|
14 |
+
from astrapy.exceptions import CollectionAlreadyExistsException
|
15 |
|
16 |
# Configure the root logger once at import time: records carry a timestamp,
# the level name and the calling function, and the root level is ERROR.
logging.basicConfig(
    level=logging.ERROR,
    datefmt="%Y-%m-%d %H:%M:%S",
    format='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s',
)
|
20 |
|
21 |
+
# Astra DB credentials are read from the environment; a missing variable
# raises KeyError when this module is imported.
ASTRA_DB_APPLICATION_TOKEN = os.environ['ASTRA_DB_APPLICATION_TOKEN']
ASTRA_DB_API_ENDPOINT = os.environ['ASTRA_DB_API_ENDPOINT']

# Name of the collection used by the vector store in this module.
COLLECTION_NAME = "article"

# Embedding-service settings passed to the collection at creation time.
# NOTE(review): "AZURE_OPENAI_API_KEY" looks like the name of a stored
# provider secret rather than the key itself - confirm against the Astra setup.
VECTOR_OPTIONS = CollectionVectorServiceOptions(
    provider="azureOpenAI",
    model_name="text-embedding-3-small",
    authentication=dict(providerKey="AZURE_OPENAI_API_KEY"),
    parameters=dict(
        resourceName="openai-oe",
        deploymentId="text-embedding-3-small",
    ),
)

# Data API client and a handle on the database behind the configured endpoint.
client = DataAPIClient(token=ASTRA_DB_APPLICATION_TOKEN)
database = client.get_database(ASTRA_DB_API_ENDPOINT)
|
37 |
|
38 |
# Client-side Azure OpenAI embedding model handed to the vector store below.
embedding = AzureOpenAIEmbeddings(
    azure_endpoint="https://openai-oe.openai.azure.com/",
    api_version="2024-07-01-preview",
)
|
41 |
|
42 |
+
# Make sure the collection exists before the vector store attaches to it.
# `collection` is bound on both paths (newly created or already present), so
# later code can rely on the name regardless of which branch ran.
# NOTE(review): the root logger is configured at ERROR in this module, so the
# INFO records below are not emitted unless that level is lowered.
try:
    # Only the call that can raise stays inside the try body.
    collection = database.create_collection(
        COLLECTION_NAME,
        dimension=1536,  # Default dimension for text-embedding-3-small
        metric="cosine",
        service=VECTOR_OPTIONS,
    )
except CollectionAlreadyExistsException:
    # No settings verification is performed here; the existing collection is
    # reused as-is, and the log message says exactly that.
    logging.info("Collection '%s' already exists. Reusing it.", COLLECTION_NAME)
    collection = database.get_collection(COLLECTION_NAME)
else:
    logging.info("Created new collection '%s'", COLLECTION_NAME)
|
54 |
+
|
55 |
# Vector store bound to the collection named above, in the default keyspace.
# NOTE(review): the collection is created with a server-side embedding service
# (VECTOR_OPTIONS) while this store is given a client-side `embedding`;
# confirm the two configurations are meant to coexist.
vstore = AstraDBVectorStore(
    embedding=embedding,
    collection_name=COLLECTION_NAME,
    namespace="default_keyspace",
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
)
|
61 |
+
|
62 |
+
# vstore = AstraDBVectorStore(
|
63 |
+
# collection_vector_service_options=CollectionVectorServiceOptions(
|
64 |
+
# provider="azureOpenAI",
|
65 |
+
# model_name="text-embedding-3-small",
|
66 |
+
# authentication={
|
67 |
+
# "providerKey": "AZURE_OPENAI_API_KEY",
|
68 |
+
# },
|
69 |
+
# parameters={
|
70 |
+
# "resourceName": "openai-oe",
|
71 |
+
# "deploymentId": "text-embedding-3-small",
|
72 |
+
# },
|
73 |
+
# ),
|
74 |
+
# namespace="default_keyspace",
|
75 |
+
# collection_name="article",
|
76 |
+
# token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
|
77 |
+
# api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"])
|
78 |
|
79 |
def token_length(text):
|
80 |
"""
|