gavinzli committed
Commit 93058c6 · Parent: 472c3fb

refactor logging and streamline content update process

Files changed (2)
  1. controllers/utils.py +10 -14
  2. source/eastmoney.py +2 -2
controllers/utils.py CHANGED
@@ -231,7 +231,7 @@ def update_content(report):
     Returns:
         None
     """
-    print("Updating content for %s", report['id'])
+    logging.info("Updating content for %s", report['id'])
     dynamodb = get_client_connection()
     dynamodb.update_item(
         TableName="Article_China",
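The replaced print call was a latent bug as well as a style issue: print does not apply %-formatting, so it printed the literal "%s" and the id as two separate arguments. logging.info with %-style arguments fixes both and defers interpolation to the logging machinery. A minimal sketch of the difference:

    import logging
    logging.basicConfig(level=logging.INFO)
    report = {'id': 'abc123'}  # placeholder record for illustration

    print("Updating content for %s", report['id'])
    # -> Updating content for %s abc123   (print never applies %-formatting)

    logging.info("Updating content for %s", report['id'])
    # -> INFO:root:Updating content for abc123   (the %s is filled in by logging)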
@@ -299,9 +299,6 @@ def update_content(report):
                 'L': []
             }
         })
-    # vectorize(report)
-    # openai_vectorize(report)
-
 
 def update_reference(report):
     """
@@ -458,24 +455,24 @@ def extract_reference(row):
     # Get the pattern for the given site. If not found, skip extraction.
     pattern = next((elem for elem in patterns if elem['site'] == row['site']), None)
     if pattern is None:
-        logging.warning(f"No reference pattern found for site {row['site']}. Skipping reference extraction.")
+        logging.warning("No reference pattern found for site %s. Skipping reference extraction.", row['site'])
         return []
-
+
     # Extract text from PDF. If extraction fails, return an empty list.
     extracted_text = extract_from_pdf_by_pattern(row.get('attachment', ''), pattern)
     if not extracted_text:
-        logging.warning(f"PDF extraction returned empty text for record {row['id']}.")
+        logging.warning("PDF extraction returned empty text for record %s.", row['id'])
         return []
-
+
     # Now safely attempt to extract reference titles and dates.
     reference_titles = re.findall(pattern.get('article_regex', ''), extracted_text) or []
     reference_dates = re.findall(pattern.get('date_regex', ''), extracted_text) or []
-
+
     # Proceed only if reference_titles and reference_dates are non-empty.
     if not reference_titles or not reference_dates:
-        logging.info(f"No reference titles or dates found for record {row['id']}.")
+        logging.info("No reference titles or dates found for record %s.", row['id'])
         return []
-
+
     reference_titles = [s.replace(' ', '') for s in reference_titles]
     reference_dates = [s.replace(' ', '') for s in reference_dates]
     if 'remove' in pattern:
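Beyond style, swapping f-strings for %-style arguments changes when the message is built: an f-string is interpolated before logging ever sees it, even if the record is then filtered out, while %-style arguments are only formatted when a handler actually emits the record. A sketch, assuming a logger whose threshold is above INFO (the row dict is a hypothetical record):

    import logging
    logging.basicConfig(level=logging.WARNING)
    row = {'id': 'r-001', 'site': 'eastmoney'}

    # Interpolated eagerly, then discarded by the WARNING threshold:
    logging.info(f"No reference titles or dates found for record {row['id']}.")

    # Never formatted at all while INFO is disabled:
    logging.info("No reference titles or dates found for record %s.", row['id'])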
@@ -719,8 +716,7 @@ def crawl_by_url(url, article):
                             article['titleCN'] + article['publishDate'])
     logging.info("%s - %s", article['id'], article['site'])
     article['referenceid'] = None
-    # update_content(article)
-    # vectorize(article)
-    # openai_vectorize(article)
+    update_content(article)
+    vectorize(article)
 
 data = download_files_from_s3('data')
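The hunks at line 231 and line 716 connect: crawl_by_url now calls update_content directly (only openai_vectorize stays out of the pipeline), and update_content wraps a low-level dynamodb.update_item against the Article_China table. For orientation, a minimal sketch of that call shape with boto3 — the key schema, update expression, and attribute names below are assumptions, not the repo's actual ones:

    import boto3

    def update_content_sketch(report):
        # Shape of the update_item call made by update_content; the Key schema,
        # UpdateExpression, and attribute names are illustrative assumptions.
        dynamodb = boto3.client('dynamodb')
        dynamodb.update_item(
            TableName="Article_China",
            Key={'id': {'S': report['id']}},
            UpdateExpression="SET content = :c",
            ExpressionAttributeValues={':c': {'S': report.get('content', '')}},
        )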
 
source/eastmoney.py CHANGED
@@ -96,8 +96,8 @@ def _crawl(url, article, retries=3):
     reference_id = extract_reference(article)
     if reference_id:
         article['referenceid'] = reference_id
-    # update_content(article)
-    # vectorize(article)
+    update_content(article)
+    vectorize(article)
     # openai_vectorize(article)
 
 @task(name = "Data Collection - eastmoney", log_prints = True)
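_crawl accepts a retries=3 parameter. The snippet below sketches the usual shape of such a retry loop as a generic wrapper; it is an illustrative pattern under that assumption, not the repo's implementation:

    import time

    def with_retries(fn, retries=3, *args, **kwargs):
        # Generic retry wrapper sketching the loop implied by _crawl's retries=3;
        # illustrative only, not the repo's code.
        for attempt in range(1, retries + 1):
            try:
                return fn(*args, **kwargs)
            except Exception:  # real code should catch narrower network errors
                if attempt == retries:
                    raise
                time.sleep(attempt)  # simple linear backoff between attempts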
 