Refactor content update process to ensure reference ID is set to None and re-enable vectorization functions in article processing
- controllers/utils.py +6 -2
- controllers/vectorizer.py +1 -1
- source/cbirc.py +4 -0
- source/eastmoney.py +4 -2
controllers/utils.py
CHANGED

```diff
@@ -297,8 +297,8 @@ def update_content(report):
                 'S': report['sentimentLabel']
             }
         })
-    vectorize(report)
-    openai_vectorize(report)
+    # vectorize(report)
+    # openai_vectorize(report)


 def update_reference(report):
@@ -694,6 +694,10 @@ def crawl_by_url(url, article):
     article['id'] = uuid.uuid5(uuid.NAMESPACE_OID,
                                article['titleCN'] + article['publishDate'])
     logging.info("%s - %s", article['id'], article['site'])
+    article['referenceid'] = None
     update_content(article)
+    vectorize(article)
+    openai_vectorize(article)
+

 data = download_files_from_s3('data')
```
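The net effect in controllers/utils.py: update_content no longer triggers vectorization itself, and crawl_by_url now defaults referenceid, persists the article, and then runs both vectorization pipelines explicitly. Below is a minimal sketch of that call-site pattern; the three stubs are hypothetical stand-ins for the real functions in controllers/utils.py and controllers/vectorizer.py.

```python
# Minimal sketch of the call-site pattern this commit adopts.
# All three functions are hypothetical stubs, not the real implementations.

def update_content(article):
    print(f"persisted {article['id']}")             # stub: writes the article record

def vectorize(article):
    print(f"vectorized {article['id']}")            # stub: local embedding pipeline

def openai_vectorize(article):
    print(f"embedded {article['id']} via OpenAI")   # stub: OpenAI embedding pipeline

def process(article):
    article['referenceid'] = None   # default so the field always exists
    update_content(article)         # persist first ...
    vectorize(article)              # ... then index in both vector stores
    openai_vectorize(article)

process({'id': 'demo-article'})
```

Presumably the motivation for commenting the calls out of update_content and repeating them at each crawl site is that callers which only need persistence can now skip vectorization, while the crawlers keep it enabled.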
controllers/vectorizer.py
CHANGED

```diff
@@ -74,7 +74,7 @@ def add_documents_with_retry(chunks, ids, max_retries=3):
     """
     for attempt in range(max_retries):
         try:
-
+            openai_vstore.add_documents(chunks, ids=ids)
         except (ConnectionError, TimeoutError) as e:
             logging.info("Attempt %d failed: %s", attempt + 1, e)
             if attempt < max_retries - 1:
```
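This hunk restores the openai_vstore.add_documents call inside the retry loop (the content of the line it replaces was lost in extraction and is not shown above). The hunk also cuts off before the body of the `if attempt < max_retries - 1:` branch. A self-contained sketch of how such a wrapper typically reads, assuming an exponential-backoff sleep and a return on success, both of which lie outside the visible diff:

```python
import logging
import time

logging.basicConfig(level=logging.INFO)

class FakeVStore:
    """Hypothetical stand-in for openai_vstore: fails twice, then succeeds."""
    def __init__(self):
        self.calls = 0

    def add_documents(self, chunks, ids=None):
        self.calls += 1
        if self.calls < 3:
            raise ConnectionError("transient upstream error")

openai_vstore = FakeVStore()

def add_documents_with_retry(chunks, ids, max_retries=3):
    for attempt in range(max_retries):
        try:
            openai_vstore.add_documents(chunks, ids=ids)
            return                        # stop retrying on success (not visible in the hunk)
        except (ConnectionError, TimeoutError) as e:
            logging.info("Attempt %d failed: %s", attempt + 1, e)
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # assumed backoff; the diff truncates here
            else:
                raise                     # assumed: surface the final failure

add_documents_with_retry(["chunk text"], ids=["id-1"])
```

Without some early exit after a successful call, the loop would re-add the same documents on every attempt, so a return (or break) like the one sketched here is almost certainly present in the unshown portion.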
source/cbirc.py
CHANGED

```diff
@@ -13,6 +13,7 @@ from controllers.utils import (
     translate,
     update_content,
 )
+from controllers.vectorizer import openai_vectorize, vectorize

 @task(name = "Data Collection - cbirc", log_prints = True)
 def crawl(delta):
@@ -74,6 +75,9 @@ def crawl(delta):
                 article['subtitle'] = summarize(article['content'])
             except (RuntimeError, ValueError):
                 article['subtitle'] = translate(summary)
+            article['referenceid'] = None
             update_content(article)
+            vectorize(article)
+            openai_vectorize(article)
         except (ValueError, KeyError, TypeError) as error:
             logger.error(error)
```
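With the import added, cbirc's crawl task now follows the same sequence as crawl_by_url in controllers/utils.py: default referenceid to None, persist via update_content, then vectorize through both pipelines (see the sketch after the controllers/utils.py diff).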
source/eastmoney.py
CHANGED

```diff
@@ -22,6 +22,7 @@ from controllers.utils import (
     translate,
     update_content
 )
+from controllers.vectorizer import openai_vectorize, vectorize

 with open('xpath.json', 'r', encoding='UTF-8') as f:
     xpath_dict = json.load(f)
@@ -91,12 +92,13 @@ def _crawl(url, article, retries=3):
                                article['titleCN'] + article['publishDate'])
     article['sentimentScore'], article[
         'sentimentLabel'] = sentiment_computation(contentcn.replace("\n", ""))
+    article['referenceid'] = None
     reference_id = extract_reference(article)
     if reference_id:
         article['referenceid'] = reference_id
-    else:
-        article['referenceid'] = None
     update_content(article)
+    vectorize(article)
+    openai_vectorize(article)

 @task(name = "Data Collection - eastmoney", log_prints = True)
 def crawl(delta):
```
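In eastmoney the if/else collapses into a default-then-overwrite pattern. A small sketch of the equivalence, with extract_reference replaced by a hypothetical stub so both branches can be exercised:

```python
def extract_reference(article):
    # Hypothetical stub so both branches can be exercised; the real
    # extract_reference lives elsewhere in the codebase.
    return article.get('raw_ref')

def set_reference(article):
    # Default first, then overwrite on success: behaviourally equivalent to
    # the removed if/else, but 'referenceid' exists on every article even if
    # a later code path skips the conditional.
    article['referenceid'] = None
    reference_id = extract_reference(article)
    if reference_id:
        article['referenceid'] = reference_id
    return article

assert set_reference({'raw_ref': 'REF-1'})['referenceid'] == 'REF-1'
assert set_reference({})['referenceid'] is None
```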