refactor logging and streamline content update process
Files changed:
- controllers/utils.py  +10 -14
- source/eastmoney.py  +2 -2
controllers/utils.py

@@ -231,7 +231,7 @@ def update_content(report):
     Returns:
         None
     """
-
+    logging.info("Updating content for %s", report['id'])
     dynamodb = get_client_connection()
     dynamodb.update_item(
         TableName="Article_China",
@@ -299,9 +299,6 @@ def update_content(report):
                 'L': []
             }
         })
-    # vectorize(report)
-    # openai_vectorize(report)
-

 def update_reference(report):
     """
@@ -458,24 +455,24 @@ def extract_reference(row):
     # Get the pattern for the given site. If not found, skip extraction.
     pattern = next((elem for elem in patterns if elem['site'] == row['site']), None)
     if pattern is None:
-        logging.warning(
+        logging.warning("No reference pattern found for site %s. Skipping reference extraction.", row['site'])
         return []
-
+
     # Extract text from PDF. If extraction fails, return an empty list.
     extracted_text = extract_from_pdf_by_pattern(row.get('attachment', ''), pattern)
     if not extracted_text:
-        logging.warning(
+        logging.warning("PDF extraction returned empty text for record %s.", row['id'])
         return []
-
+
     # Now safely attempt to extract reference titles and dates.
     reference_titles = re.findall(pattern.get('article_regex', ''), extracted_text) or []
     reference_dates = re.findall(pattern.get('date_regex', ''), extracted_text) or []
-
+
     # Proceed only if reference_titles and reference_dates are non-empty.
     if not reference_titles or not reference_dates:
-        logging.info(
+        logging.info("No reference titles or dates found for record %s.", row['id'])
         return []
-
+
     reference_titles = [s.replace(' ', '') for s in reference_titles]
     reference_dates = [s.replace(' ', '') for s in reference_dates]
     if 'remove' in pattern:
@@ -719,8 +716,7 @@ def crawl_by_url(url, article):
                                 article['titleCN'] + article['publishDate'])
     logging.info("%s - %s", article['id'], article['site'])
     article['referenceid'] = None
-
-
-    # openai_vectorize(article)
+    update_content(article)
+    vectorize(article)

     data = download_files_from_s3('data')
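Taken together, the extract_reference changes replace bare logging calls with structured messages on each early-return path. The sketch below shows the resulting guard-clause pattern as a self-contained example; the PATTERNS list, the regexes, the extract_from_pdf_by_pattern stub, and the final return value are illustrative placeholders, not the repository's actual definitions.

import logging
import re

# Illustrative stand-ins; the real pattern list and PDF extractor live elsewhere in the repo.
PATTERNS = [
    {"site": "eastmoney", "article_regex": r"《(.+?)》", "date_regex": r"\d{4}-\d{2}-\d{2}"},
]

def extract_from_pdf_by_pattern(attachment, pattern):
    # Placeholder: the real helper extracts text from the attached PDF.
    return attachment

def extract_reference(row):
    """Extract reference titles and dates for a crawled row, logging why extraction is skipped."""
    # Get the pattern for the given site. If not found, skip extraction.
    pattern = next((elem for elem in PATTERNS if elem["site"] == row["site"]), None)
    if pattern is None:
        logging.warning("No reference pattern found for site %s. Skipping reference extraction.", row["site"])
        return []

    # Extract text from PDF. If extraction fails, return an empty list.
    extracted_text = extract_from_pdf_by_pattern(row.get("attachment", ""), pattern)
    if not extracted_text:
        logging.warning("PDF extraction returned empty text for record %s.", row["id"])
        return []

    # Only proceed when both titles and dates were matched.
    reference_titles = re.findall(pattern.get("article_regex", ""), extracted_text) or []
    reference_dates = re.findall(pattern.get("date_regex", ""), extracted_text) or []
    if not reference_titles or not reference_dates:
        logging.info("No reference titles or dates found for record %s.", row["id"])
        return []

    # Strip embedded spaces, as in the diff; the real function goes on to apply
    # the pattern's optional 'remove' rules and build reference ids.
    reference_titles = [s.replace(" ", "") for s in reference_titles]
    reference_dates = [s.replace(" ", "") for s in reference_dates]
    return list(zip(reference_titles, reference_dates))

Calling extract_reference({"site": "unknown", "id": "x"}) logs the warning and returns [], which is the behavior the new guard clauses make visible in the logs.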
source/eastmoney.py

@@ -96,8 +96,8 @@ def _crawl(url, article, retries=3):
     reference_id = extract_reference(article)
     if reference_id:
         article['referenceid'] = reference_id
-
-
+    update_content(article)
+    vectorize(article)
     # openai_vectorize(article)

 @task(name = "Data Collection - eastmoney", log_prints = True)
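The net effect in both files is the same call order after a crawl: extract references, write the article back with update_content, then vectorize it (in eastmoney the openai_vectorize call stays commented out). A minimal sketch of that flow, with the crawl, storage, and embedding steps stubbed out, could look like the following; only the call order is taken from the diff, everything else is assumed.

import logging

def crawl(url):
    # Placeholder for a site-specific crawler such as eastmoney's _crawl.
    return {"id": url, "site": "eastmoney", "attachment": "", "referenceid": None}

def extract_reference(article):
    # Placeholder; see the extract_reference sketch above.
    return None

def update_content(article):
    # The real function logs and writes the article back to the Article_China table.
    logging.info("Updating content for %s", article["id"])

def vectorize(article):
    # Placeholder for computing and storing the article's embedding.
    pass

def process(url):
    article = crawl(url)
    reference_id = extract_reference(article)
    if reference_id:
        article["referenceid"] = reference_id
    update_content(article)   # persist the article once its references are resolved
    vectorize(article)        # then refresh its embedding

if __name__ == "__main__":
    process("https://example.com/article")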