Merge branch 'main' of https://github.com/oxbridge-econ/data-collection-china
Browse files
- controllers/utils.py +21 -6
controllers/utils.py
CHANGED
@@ -455,12 +455,27 @@ def extract_reference(row):
|
|
455 |
"""
|
456 |
try:
|
457 |
print("Extracting reference for %s", row['id'])
|
458 |
-
pattern
|
459 |
-
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
464 |
reference_titles = [s.replace(' ', '') for s in reference_titles]
|
465 |
reference_dates = [s.replace(' ', '') for s in reference_dates]
|
466 |
if 'remove' in pattern:
|
|
|
455 |
"""
|
456 |
try:
|
457 |
print("Extracting reference for %s", row['id'])
|
458 |
+
# Get the pattern for the given site. If not found, skip extraction.
|
459 |
+
pattern = next((elem for elem in patterns if elem['site'] == row['site']), None)
|
460 |
+
if pattern is None:
|
461 |
+
logging.warning(f"No reference pattern found for site {row['site']}. Skipping reference extraction.")
|
462 |
+
return []
|
463 |
+
|
464 |
+
# Extract text from PDF. If extraction fails, return an empty list.
|
465 |
+
extracted_text = extract_from_pdf_by_pattern(row.get('attachment', ''), pattern)
|
466 |
+
if not extracted_text:
|
467 |
+
logging.warning(f"PDF extraction returned empty text for record {row['id']}.")
|
468 |
+
return []
|
469 |
+
|
470 |
+
# Now safely attempt to extract reference titles and dates.
|
471 |
+
reference_titles = re.findall(pattern.get('article_regex', ''), extracted_text) or []
|
472 |
+
reference_dates = re.findall(pattern.get('date_regex', ''), extracted_text) or []
|
473 |
+
|
474 |
+
# Proceed only if reference_titles and reference_dates are non-empty.
|
475 |
+
if not reference_titles or not reference_dates:
|
476 |
+
logging.info(f"No reference titles or dates found for record {row['id']}.")
|
477 |
+
return []
|
478 |
+
|
479 |
reference_titles = [s.replace(' ', '') for s in reference_titles]
|
480 |
reference_dates = [s.replace(' ', '') for s in reference_dates]
|
481 |
if 'remove' in pattern:
|