Refactor vectorization process by removing openai_vectorize calls and updating vectorizer initialization
- controllers/utils.py +2 -2
- controllers/vectorizer.py +45 -44
- source/eastmoney.py +2 -2
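At the call sites touched in controllers/utils.py and source/eastmoney.py, each crawled article now goes through a single vectorize() call; the separate openai_vectorize() pass is commented out. A minimal sketch of the resulting flow (store_article is a hypothetical wrapper used only for illustration; the real code runs these lines inline in crawl_by_url and _crawl):

from controllers.utils import update_content
from controllers.vectorizer import vectorize

def store_article(article: dict) -> None:
    # Persist the article record, then chunk it and upsert it into AstraDB.
    # Before this commit there was a second pass, openai_vectorize(article).
    update_content(article)
    vectorize(article)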
controllers/utils.py
CHANGED

@@ -22,7 +22,7 @@ import PyPDF2
 from transformers import pipeline
 
 from controllers.summarizer import summarize
-from controllers.vectorizer import vectorize, openai_vectorize
+from controllers.vectorizer import vectorize
 
 load_dotenv()
 
@@ -703,6 +703,6 @@ def crawl_by_url(url, article):
     article['referenceid'] = None
     update_content(article)
     vectorize(article)
-    openai_vectorize(article)
+    # openai_vectorize(article)
 
     data = download_files_from_s3('data')
controllers/vectorizer.py
CHANGED

@@ -1,7 +1,7 @@
 """Module to upsert data into AstraDB"""
 import os
 import logging
-import uuid
+# import uuid
 import time
 
 import tiktoken
@@ -24,13 +24,13 @@ embedding = AzureOpenAIEmbeddings(
     api_version="2024-07-01-preview",
     azure_endpoint="https://openai-oe.openai.azure.com/")
 
-vstore = AstraDBVectorStore(embedding=embedding,
-                            namespace="default_keyspace",
-                            collection_name="FinFast_China",
-                            token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
-                            api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"])
+# vstore = AstraDBVectorStore(embedding=embedding,
+#                             namespace="default_keyspace",
+#                             collection_name="FinFast_China",
+#                             token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
+#                             api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"])
 
-openai_vstore = AstraDBVectorStore(
+vstore = AstraDBVectorStore(
     collection_vector_service_options=CollectionVectorServiceOptions(
         provider="azureOpenAI",
         model_name="text-embedding-3-small",
@@ -43,7 +43,7 @@ openai_vstore = AstraDBVectorStore(
         },
     ),
     namespace="default_keyspace",
-    collection_name="
+    collection_name="article",
     token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
     api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"])
 
@@ -74,7 +74,7 @@ def add_documents_with_retry(chunks, ids, max_retries=3):
     """
     for attempt in range(max_retries):
         try:
-            openai_vstore.add_documents(chunks, ids=ids)
+            vstore.add_documents(chunks, ids=ids)
         except (ConnectionError, TimeoutError) as e:
             logging.info("Attempt %d failed: %s", attempt + 1, e)
             if attempt < max_retries - 1:
@@ -82,8 +82,9 @@ def add_documents_with_retry(chunks, ids, max_retries=3):
         else:
             logging.error("Max retries reached. Operation failed.")
             logging.error(ids)
+            print(ids)
 
-def openai_vectorize(article):
+def vectorize(article):
     """
     Process the given article.
 
@@ -118,37 +119,37 @@ def openai_vectorize(article):
     except (ConnectionError, TimeoutError, ValueError) as e:
         logging.error("Failed to add documents: %s", e)
 
-def vectorize(article):
-    """
-    Process the given article.
-
-    Parameters:
-    article (DataFrame): The article to be processed.
-
-    Returns:
-    None
-    """
-    article['id'] = str(article['id'])
-    if isinstance(article, dict):
-        article = [article] # Convert single dictionary to list of dictionaries
-    df = pd.DataFrame(article)
-    df = df[['id','site','title','titleCN','category','author','content',
-            'publishDate','link']]
-    df['publishDate'] = pd.to_datetime(df['publishDate'])
-    loader = DataFrameLoader(df, page_content_column="content")
-    documents = loader.load()
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=800,
-        chunk_overlap=20,
-        length_function=len,
-        is_separator_regex=False,
-    )
-
-    chunks = text_splitter.split_documents(documents)
-    ids = []
-    for chunk in chunks:
-        _id = f"{chunk.metadata['id']}-{str(uuid.uuid5(uuid.NAMESPACE_OID,chunk.page_content))}"
-        ids.append(_id)
-    inserted_ids = vstore.add_documents(chunks, ids=ids)
-    print(inserted_ids)
-    logging.info(inserted_ids)
+# def vectorize(article):
+#     """
+#     Process the given article.
+#
+#     Parameters:
+#     article (DataFrame): The article to be processed.
+#
+#     Returns:
+#     None
+#     """
+#     article['id'] = str(article['id'])
+#     if isinstance(article, dict):
+#         article = [article] # Convert single dictionary to list of dictionaries
+#     df = pd.DataFrame(article)
+#     df = df[['id','site','title','titleCN','category','author','content',
+#             'publishDate','link']]
+#     df['publishDate'] = pd.to_datetime(df['publishDate'])
+#     loader = DataFrameLoader(df, page_content_column="content")
+#     documents = loader.load()
+#     text_splitter = RecursiveCharacterTextSplitter(
+#         chunk_size=800,
+#         chunk_overlap=20,
+#         length_function=len,
+#         is_separator_regex=False,
+#     )
+#
+#     chunks = text_splitter.split_documents(documents)
+#     ids = []
+#     for chunk in chunks:
+#         _id = f"{chunk.metadata['id']}-{str(uuid.uuid5(uuid.NAMESPACE_OID,chunk.page_content))}"
+#         ids.append(_id)
+#     inserted_ids = vstore.add_documents(chunks, ids=ids)
+#     print(inserted_ids)
+#     logging.info(inserted_ids)
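The surviving vstore is the one configured with CollectionVectorServiceOptions, so embedding happens server-side in Astra DB (azureOpenAI provider, text-embedding-3-small, collection "article") rather than through the client-side AzureOpenAIEmbeddings object used by the old FinFast_China store, which is now commented out along with the old vectorize(). A minimal sketch of building and querying such a store with langchain-astradb; the import path for CollectionVectorServiceOptions, the omitted authentication/parameters block, and the similarity_search call are illustrative assumptions, not part of this commit:

import os

from astrapy.info import CollectionVectorServiceOptions  # assumed import path
from langchain_astradb import AstraDBVectorStore

# Server-side embedding: Astra DB calls Azure OpenAI itself, so no embedding= argument.
vstore = AstraDBVectorStore(
    collection_vector_service_options=CollectionVectorServiceOptions(
        provider="azureOpenAI",
        model_name="text-embedding-3-small",
        # authentication / parameters (deployment, resource) omitted here;
        # the commit leaves that block unchanged.
    ),
    namespace="default_keyspace",
    collection_name="article",
    token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
    api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
)

# Illustrative read path: retrieve the closest chunks for a query string.
for doc in vstore.similarity_search("example query", k=4):
    print(doc.metadata.get("id"), doc.page_content[:80])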
source/eastmoney.py
CHANGED

@@ -22,7 +22,7 @@ from controllers.utils import (
     translate,
     update_content
 )
-from controllers.vectorizer import vectorize, openai_vectorize
+from controllers.vectorizer import vectorize
 
 with open('xpath.json', 'r', encoding='UTF-8') as f:
     xpath_dict = json.load(f)
@@ -98,7 +98,7 @@ def _crawl(url, article, retries=3):
     article['referenceid'] = reference_id
     update_content(article)
     vectorize(article)
-    openai_vectorize(article)
+    # openai_vectorize(article)
 
 @task(name = "Data Collection - eastmoney", log_prints = True)
 def crawl(delta):