Refactor content update process to ensure reference ID is set to None and re-enable vectorization functions in article processing
- controllers/utils.py +6 -2
- controllers/vectorizer.py +1 -1
- source/cbirc.py +4 -0
- source/eastmoney.py +4 -2
controllers/utils.py
CHANGED

```diff
@@ -297,8 +297,8 @@ def update_content(report):
                 'S': report['sentimentLabel']
             }
         })
-    vectorize(report)
-    openai_vectorize(report)
+    # vectorize(report)
+    # openai_vectorize(report)


 def update_reference(report):
@@ -694,6 +694,10 @@ def crawl_by_url(url, article):
     article['id'] = uuid.uuid5(uuid.NAMESPACE_OID,
                                article['titleCN'] + article['publishDate'])
     logging.info("%s - %s", article['id'], article['site'])
+    article['referenceid'] = None
     update_content(article)
+    vectorize(article)
+    openai_vectorize(article)
+

 data = download_files_from_s3('data')
```
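The net effect in controllers/utils.py: update_content no longer triggers vectorization itself, and crawl_by_url now defaults referenceid, persists the article, and then runs both vectorization pipelines explicitly. Below is a minimal sketch of that call-site pattern; the three stubs are hypothetical stand-ins for the real functions in controllers/utils.py and controllers/vectorizer.py.

```python
# Minimal sketch of the call-site pattern this commit adopts.
# All three functions are hypothetical stubs, not the real implementations.

def update_content(article):
    print(f"persisted {article['id']}")             # stub: writes the article record

def vectorize(article):
    print(f"vectorized {article['id']}")            # stub: local embedding pipeline

def openai_vectorize(article):
    print(f"embedded {article['id']} via OpenAI")   # stub: OpenAI embedding pipeline

def process(article):
    article['referenceid'] = None   # default so the field always exists
    update_content(article)         # persist first ...
    vectorize(article)              # ... then index in both vector stores
    openai_vectorize(article)

process({'id': 'demo-article'})
```

Presumably the motivation for commenting the calls out of update_content and repeating them at each crawl site is that callers which only need persistence can now skip vectorization, while the crawlers keep it enabled.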
controllers/vectorizer.py
CHANGED

```diff
@@ -74,7 +74,7 @@ def add_documents_with_retry(chunks, ids, max_retries=3):
     """
     for attempt in range(max_retries):
         try:
-
+            openai_vstore.add_documents(chunks, ids=ids)
         except (ConnectionError, TimeoutError) as e:
             logging.info("Attempt %d failed: %s", attempt + 1, e)
             if attempt < max_retries - 1:
```
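This hunk restores the openai_vstore.add_documents call inside the retry loop (the content of the line it replaces was lost in extraction and is not shown above). The hunk also cuts off before the body of the `if attempt < max_retries - 1:` branch. A self-contained sketch of how such a wrapper typically reads, assuming an exponential-backoff sleep and a return on success, both of which lie outside the visible diff:

```python
import logging
import time

logging.basicConfig(level=logging.INFO)

class FakeVStore:
    """Hypothetical stand-in for openai_vstore: fails twice, then succeeds."""
    def __init__(self):
        self.calls = 0

    def add_documents(self, chunks, ids=None):
        self.calls += 1
        if self.calls < 3:
            raise ConnectionError("transient upstream error")

openai_vstore = FakeVStore()

def add_documents_with_retry(chunks, ids, max_retries=3):
    for attempt in range(max_retries):
        try:
            openai_vstore.add_documents(chunks, ids=ids)
            return                        # stop retrying on success (not visible in the hunk)
        except (ConnectionError, TimeoutError) as e:
            logging.info("Attempt %d failed: %s", attempt + 1, e)
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # assumed backoff; the diff truncates here
            else:
                raise                     # assumed: surface the final failure

add_documents_with_retry(["chunk text"], ids=["id-1"])
```

Without some early exit after a successful call, the loop would re-add the same documents on every attempt, so a return (or break) like the one sketched here is almost certainly present in the unshown portion.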
source/cbirc.py
CHANGED

```diff
@@ -13,6 +13,7 @@ from controllers.utils import (
     translate,
     update_content,
 )
+from controllers.vectorizer import openai_vectorize, vectorize

 @task(name = "Data Collection - cbirc", log_prints = True)
 def crawl(delta):
@@ -74,6 +75,9 @@ def crawl(delta):
                 article['subtitle'] = summarize(article['content'])
             except (RuntimeError, ValueError):
                 article['subtitle'] = translate(summary)
+            article['referenceid'] = None
             update_content(article)
+            vectorize(article)
+            openai_vectorize(article)
         except (ValueError, KeyError, TypeError) as error:
             logger.error(error)
```
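With the import added, cbirc's crawl task now follows the same sequence as crawl_by_url in controllers/utils.py: default referenceid to None, persist via update_content, then vectorize through both pipelines (see the sketch after the controllers/utils.py diff).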
source/eastmoney.py
CHANGED

```diff
@@ -22,6 +22,7 @@ from controllers.utils import (
     translate,
     update_content
 )
+from controllers.vectorizer import openai_vectorize, vectorize

 with open('xpath.json', 'r', encoding='UTF-8') as f:
     xpath_dict = json.load(f)
@@ -91,12 +92,13 @@ def _crawl(url, article, retries=3):
                                article['titleCN'] + article['publishDate'])
     article['sentimentScore'], article[
         'sentimentLabel'] = sentiment_computation(contentcn.replace("\n", ""))
+    article['referenceid'] = None
     reference_id = extract_reference(article)
     if reference_id:
         article['referenceid'] = reference_id
-    else:
-        article['referenceid'] = None
     update_content(article)
+    vectorize(article)
+    openai_vectorize(article)

 @task(name = "Data Collection - eastmoney", log_prints = True)
 def crawl(delta):
```
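In eastmoney the if/else collapses into a default-then-overwrite pattern. A small sketch of the equivalence, with extract_reference replaced by a hypothetical stub so both branches can be exercised:

```python
def extract_reference(article):
    # Hypothetical stub so both branches can be exercised; the real
    # extract_reference lives elsewhere in the codebase.
    return article.get('raw_ref')

def set_reference(article):
    # Default first, then overwrite on success: behaviourally equivalent to
    # the removed if/else, but 'referenceid' exists on every article even if
    # a later code path skips the conditional.
    article['referenceid'] = None
    reference_id = extract_reference(article)
    if reference_id:
        article['referenceid'] = reference_id
    return article

assert set_reference({'raw_ref': 'REF-1'})['referenceid'] == 'REF-1'
assert set_reference({})['referenceid'] is None
```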