gavinzli committed
Commit 5fea365 · 1 Parent(s): 29d3eca

Refactor vectorization process by removing openai_vectorize calls and updating vectorizer initialization

controllers/utils.py CHANGED
@@ -22,7 +22,7 @@ import PyPDF2
 from transformers import pipeline
 
 from controllers.summarizer import summarize
-from controllers.vectorizer import vectorize, openai_vectorize
+from controllers.vectorizer import vectorize
 
 load_dotenv()
 
@@ -703,6 +703,6 @@ def crawl_by_url(url, article):
     article['referenceid'] = None
     update_content(article)
     vectorize(article)
-    openai_vectorize(article)
+    # openai_vectorize(article)
 
     data = download_files_from_s3('data')
controllers/vectorizer.py CHANGED
@@ -1,7 +1,7 @@
 """Module to upsert data into AstraDB"""
 import os
 import logging
-import uuid
+# import uuid
 import time
 
 import tiktoken
@@ -24,13 +24,13 @@ embedding = AzureOpenAIEmbeddings(
     api_version="2024-07-01-preview",
     azure_endpoint="https://openai-oe.openai.azure.com/")
 
-vstore = AstraDBVectorStore(embedding=embedding,
-                            namespace="default_keyspace",
-                            collection_name="FinFast_China",
-                            token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
-                            api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"])
+# vstore = AstraDBVectorStore(embedding=embedding,
+#                             namespace="default_keyspace",
+#                             collection_name="FinFast_China",
+#                             token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
+#                             api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"])
 
-openai_vstore = AstraDBVectorStore(
+vstore = AstraDBVectorStore(
     collection_vector_service_options=CollectionVectorServiceOptions(
         provider="azureOpenAI",
         model_name="text-embedding-3-small",
@@ -43,7 +43,7 @@ openai_vstore = AstraDBVectorStore(
         },
     ),
     namespace="default_keyspace",
-    collection_name="text_embedding_3_small",
+    collection_name="article",
     token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
     api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"])
 
@@ -74,7 +74,7 @@ def add_documents_with_retry(chunks, ids, max_retries=3):
     """
     for attempt in range(max_retries):
         try:
-            openai_vstore.add_documents(chunks, ids=ids)
+            vstore.add_documents(chunks, ids=ids)
         except (ConnectionError, TimeoutError) as e:
             logging.info("Attempt %d failed: %s", attempt + 1, e)
             if attempt < max_retries - 1:
@@ -82,8 +82,9 @@ def add_documents_with_retry(chunks, ids, max_retries=3):
         else:
             logging.error("Max retries reached. Operation failed.")
             logging.error(ids)
+            print(ids)
 
-def openai_vectorize(article):
+def vectorize(article):
     """
     Process the given article.
 
@@ -118,37 +119,37 @@ def openai_vectorize(article):
     except (ConnectionError, TimeoutError, ValueError) as e:
         logging.error("Failed to add documents: %s", e)
 
-def vectorize(article):
-    """
-    Process the given article.
-
-    Parameters:
-    article (DataFrame): The article to be processed.
-
-    Returns:
-    None
-    """
-    article['id'] = str(article['id'])
-    if isinstance(article, dict):
-        article = [article]  # Convert single dictionary to list of dictionaries
-    df = pd.DataFrame(article)
-    df = df[['id','site','title','titleCN','category','author','content',
-             'publishDate','link']]
-    df['publishDate'] = pd.to_datetime(df['publishDate'])
-    loader = DataFrameLoader(df, page_content_column="content")
-    documents = loader.load()
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=800,
-        chunk_overlap=20,
-        length_function=len,
-        is_separator_regex=False,
-    )
-
-    chunks = text_splitter.split_documents(documents)
-    ids = []
-    for chunk in chunks:
-        _id = f"{chunk.metadata['id']}-{str(uuid.uuid5(uuid.NAMESPACE_OID,chunk.page_content))}"
-        ids.append(_id)
-    inserted_ids = vstore.add_documents(chunks, ids=ids)
-    print(inserted_ids)
-    logging.info(inserted_ids)
+# def vectorize(article):
+#     """
+#     Process the given article.
+
+#     Parameters:
+#     article (DataFrame): The article to be processed.
+
+#     Returns:
+#     None
+#     """
+#     article['id'] = str(article['id'])
+#     if isinstance(article, dict):
+#         article = [article]  # Convert single dictionary to list of dictionaries
+#     df = pd.DataFrame(article)
+#     df = df[['id','site','title','titleCN','category','author','content',
+#              'publishDate','link']]
+#     df['publishDate'] = pd.to_datetime(df['publishDate'])
+#     loader = DataFrameLoader(df, page_content_column="content")
+#     documents = loader.load()
+#     text_splitter = RecursiveCharacterTextSplitter(
+#         chunk_size=800,
+#         chunk_overlap=20,
+#         length_function=len,
+#         is_separator_regex=False,
+#     )
+
+#     chunks = text_splitter.split_documents(documents)
+#     ids = []
+#     for chunk in chunks:
+#         _id = f"{chunk.metadata['id']}-{str(uuid.uuid5(uuid.NAMESPACE_OID,chunk.page_content))}"
+#         ids.append(_id)
+#     inserted_ids = vstore.add_documents(chunks, ids=ids)
+#     print(inserted_ids)
+#     logging.info(inserted_ids)
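
Read together, these hunks leave controllers/vectorizer.py with one store instead of two. Below is a minimal sketch of the resulting initialization, assuming the usual langchain-astradb/astrapy import paths; the file's actual import block and the Azure authentication/parameters passed to CollectionVectorServiceOptions are outside the hunks shown and are not reproduced here.

import os

# Assumed import paths; the real import block is not part of this diff.
from astrapy.info import CollectionVectorServiceOptions
from langchain_astradb import AstraDBVectorStore

# Single vector store: embeddings are produced server-side by Astra DB's
# vectorize integration (Azure OpenAI, text-embedding-3-small) and upserted
# into the "article" collection, replacing the old client-side
# AzureOpenAIEmbeddings store and the separate openai_vstore.
vstore = AstraDBVectorStore(
    collection_vector_service_options=CollectionVectorServiceOptions(
        provider="azureOpenAI",
        model_name="text-embedding-3-small",
        # authentication=... and parameters=... for the Azure deployment
        # are elided in the hunks above and therefore omitted here.
    ),
    namespace="default_keyspace",
    collection_name="article",
    token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
    api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
)

With this in place, add_documents_with_retry() and the renamed vectorize() write through the same vstore, so callers no longer need a separate openai_vectorize() pass.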
source/eastmoney.py CHANGED
@@ -22,7 +22,7 @@ from controllers.utils import (
     translate,
     update_content
     )
-from controllers.vectorizer import openai_vectorize, vectorize
+from controllers.vectorizer import vectorize
 
 with open('xpath.json', 'r', encoding='UTF-8') as f:
     xpath_dict = json.load(f)
@@ -98,7 +98,7 @@ def _crawl(url, article, retries=3):
     article['referenceid'] = reference_id
     update_content(article)
     vectorize(article)
-    openai_vectorize(article)
+    # openai_vectorize(article)
 
 @task(name = "Data Collection - eastmoney", log_prints = True)
 def crawl(delta):
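
For illustration only, a hypothetical call after this change: the field names are copied from the old, now commented-out vectorize() implementation, and whether the renamed function expects exactly the same shape is an assumption, since its body falls outside the hunks shown.

# Hypothetical usage sketch (not part of this commit).
from controllers.vectorizer import vectorize

sample_article = {
    'id': 1,
    'site': 'eastmoney',
    'title': 'Example title',
    'titleCN': '示例标题',
    'category': 'news',
    'author': 'example',
    'content': 'Article body to be chunked and embedded server-side by Astra DB.',
    'publishDate': '2025-01-01',
    'link': 'https://example.com/article',
}

# A single call now performs the whole upsert; the crawlers no longer make a
# separate openai_vectorize() call.
vectorize(sample_article)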