gavinzli committed on
Commit
236ef33
·
2 Parent(s): 4a26d9b 32cebdb

Merge branch 'main' of https://github.com/oxbridge-econ/data-collection-china

Browse files
Files changed (1) hide show
  1. controllers/utils.py +21 -6
controllers/utils.py CHANGED
@@ -455,12 +455,27 @@ def extract_reference(row):
455
  """
456
  try:
457
  print("Extracting reference for %s", row['id'])
458
- pattern = next(
459
- (elem for elem in patterns if elem['site'] == row['site']), None)
460
- extracted_text = extract_from_pdf_by_pattern(row['attachment'],
461
- pattern)
462
- reference_titles = re.findall(pattern['article_regex'], extracted_text)
463
- reference_dates = re.findall(pattern['date_regex'], extracted_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464
  reference_titles = [s.replace(' ', '') for s in reference_titles]
465
  reference_dates = [s.replace(' ', '') for s in reference_dates]
466
  if 'remove' in pattern:
 
455
  """
456
  try:
457
  print("Extracting reference for %s", row['id'])
458
+ # Get the pattern for the given site. If not found, skip extraction.
459
+ pattern = next((elem for elem in patterns if elem['site'] == row['site']), None)
460
+ if pattern is None:
461
+ logging.warning(f"No reference pattern found for site {row['site']}. Skipping reference extraction.")
462
+ return []
463
+
464
+ # Extract text from PDF. If extraction fails, return an empty list.
465
+ extracted_text = extract_from_pdf_by_pattern(row.get('attachment', ''), pattern)
466
+ if not extracted_text:
467
+ logging.warning(f"PDF extraction returned empty text for record {row['id']}.")
468
+ return []
469
+
470
+ # Now safely attempt to extract reference titles and dates.
471
+ reference_titles = re.findall(pattern.get('article_regex', ''), extracted_text) or []
472
+ reference_dates = re.findall(pattern.get('date_regex', ''), extracted_text) or []
473
+
474
+ # Proceed only if reference_titles and reference_dates are non-empty.
475
+ if not reference_titles or not reference_dates:
476
+ logging.info(f"No reference titles or dates found for record {row['id']}.")
477
+ return []
478
+
479
  reference_titles = [s.replace(' ', '') for s in reference_titles]
480
  reference_dates = [s.replace(' ', '') for s in reference_dates]
481
  if 'remove' in pattern: