gavinzli committed
Commit b68d569 · 1 Parent(s): 1c87e0d

Refactor content update process to ensure reference ID is set to None and re-enable vectorization functions in article processing

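Taken together, the three crawlers converge on the same per-article tail after this commit. A minimal sketch of that sequence, assuming only the call order visible in the hunks below; process_article is a hypothetical helper name (each crawler inlines the sequence), and the bodies of update_content, vectorize, and openai_vectorize are not part of this diff:

from controllers.utils import update_content
from controllers.vectorizer import openai_vectorize, vectorize

def process_article(article):
    # Hypothetical helper; each crawler inlines this sequence.
    article['referenceid'] = None   # default; eastmoney overrides it when
                                    # extract_reference(article) finds a match
    update_content(article)         # persist only; no longer vectorizes
    vectorize(article)              # embed into the default vector store
    openai_vectorize(article)       # embed into the OpenAI-backed store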
controllers/utils.py CHANGED
@@ -297,8 +297,8 @@ def update_content(report):
             'S': report['sentimentLabel']
         }
     })
-    vectorize(report)
-    openai_vectorize(report)
+    # vectorize(report)
+    # openai_vectorize(report)


 def update_reference(report):
@@ -694,6 +694,10 @@ def crawl_by_url(url, article):
     article['id'] = uuid.uuid5(uuid.NAMESPACE_OID,
                                article['titleCN'] + article['publishDate'])
     logging.info("%s - %s", article['id'], article['site'])
+    article['referenceid'] = None
     update_content(article)
+    vectorize(article)
+    openai_vectorize(article)
+

 data = download_files_from_s3('data')
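The 'S': report['sentimentLabel'] fragment above is the low-level DynamoDB attribute-value format, which suggests update_content ends in an UpdateItem call. A hedged reconstruction of that tail: the table name, key schema, and update expression are all assumptions; only the attribute-value shape and the commented-out calls come from the hunk.

import boto3

dynamodb = boto3.client('dynamodb')  # assumed module-level client

def update_content(report):
    dynamodb.update_item(
        TableName='article',                         # assumed table name
        Key={'id': {'S': report['id']}},             # assumed key schema
        UpdateExpression='SET sentimentLabel = :S',  # assumed expression
        ExpressionAttributeValues={
            ':S': {'S': report['sentimentLabel']}    # shape shown in the hunk
        })
    # vectorize(report)         # moved to the crawlers in this commit
    # openai_vectorize(report)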
controllers/vectorizer.py CHANGED
@@ -74,7 +74,7 @@ def add_documents_with_retry(chunks, ids, max_retries=3):
     """
     for attempt in range(max_retries):
         try:
-            vstore.add_documents(chunks, ids=ids)
+            openai_vstore.add_documents(chunks, ids=ids)
         except (ConnectionError, TimeoutError) as e:
             logging.info("Attempt %d failed: %s", attempt + 1, e)
             if attempt < max_retries - 1:
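The hunk shows only the line inside the try changing. A sketch of how add_documents_with_retry plausibly reads after the commit, assuming openai_vstore is the module-level store the diff targets, and assuming exponential backoff and a re-raise after the final attempt (neither appears in the visible lines):

import logging
import time

def add_documents_with_retry(chunks, ids, max_retries=3):
    """Add chunks to the OpenAI-backed store, retrying transient failures."""
    for attempt in range(max_retries):
        try:
            openai_vstore.add_documents(chunks, ids=ids)
            return                        # assumed: stop after a success
        except (ConnectionError, TimeoutError) as e:
            logging.info("Attempt %d failed: %s", attempt + 1, e)
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # assumed backoff
            else:
                raise                     # assumed: surface the final error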
source/cbirc.py CHANGED
@@ -13,6 +13,7 @@ from controllers.utils import (
     translate,
     update_content,
 )
+from controllers.vectorizer import openai_vectorize, vectorize

 @task(name = "Data Collection - cbirc", log_prints = True)
 def crawl(delta):
@@ -74,6 +75,9 @@ def crawl(delta):
             article['subtitle'] = summarize(article['content'])
         except (RuntimeError, ValueError):
             article['subtitle'] = translate(summary)
+        article['referenceid'] = None
         update_content(article)
+        vectorize(article)
+        openai_vectorize(article)
     except (ValueError, KeyError, TypeError) as error:
         logger.error(error)
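With this, cbirc's per-article tail matches the sequence sketched after the commit message. Note that the new vectorize/openai_vectorize calls sit inside crawl's outer try, so a ValueError, KeyError, or TypeError raised during vectorization is logged via logger.error rather than failing the task, while connection-level failures are retried inside add_documents_with_retry in controllers/vectorizer.py.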
source/eastmoney.py CHANGED
@@ -22,6 +22,7 @@ from controllers.utils import (
     translate,
     update_content
 )
+from controllers.vectorizer import openai_vectorize, vectorize

 with open('xpath.json', 'r', encoding='UTF-8') as f:
     xpath_dict = json.load(f)
@@ -91,12 +92,13 @@ def _crawl(url, article, retries=3):
                                article['titleCN'] + article['publishDate'])
     article['sentimentScore'], article[
         'sentimentLabel'] = sentiment_computation(contentcn.replace("\n", ""))
+    article['referenceid'] = None
     reference_id = extract_reference(article)
     if reference_id:
         article['referenceid'] = reference_id
-    else:
-        article['referenceid'] = None
     update_content(article)
+    vectorize(article)
+    openai_vectorize(article)

 @task(name = "Data Collection - eastmoney", log_prints = True)
 def crawl(delta):
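eastmoney differs slightly from the other two crawlers: it sets the None default first and lets extract_reference override it, which is what lets this commit drop the old else branch. A hypothetical equivalent one-liner (not part of the commit) shows the same default-then-override intent:

# Equivalent to the new default-then-override shape; not in the commit.
article['referenceid'] = extract_reference(article) or None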