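"""
Claim extraction utilities for the misinformation detector.

Extracts the main factual claim from input text (using an OpenAI model for
longer inputs), validates that the extraction adds no information not present
in the original, and shortens claims for evidence retrieval.
"""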
import logging
import time
import re

from utils.performance import PerformanceTracker
from utils.models import get_nlp_model, get_llm_model
from modules.classification import normalize_tense

logger = logging.getLogger("misinformation_detector")
performance_tracker = PerformanceTracker()
def extract_claims(text):
    """
    Extract the main factual claim from the provided text.
    For concise claims (< 30 words), preserves them exactly.
    For longer text, uses OpenAI to extract the claim.
    """
logger.info(f"Extracting claims from: {text}")
start_time = time.time()
# First, check if the input already appears to be a concise claim
if len(text.split()) < 30:
logger.info("Input appears to be a concise claim already, preserving as-is")
performance_tracker.log_processing_time(start_time)
performance_tracker.log_claim_processed()
return text
try:
# For longer text, use OpenAI for extraction
extracted_claim = extract_with_openai(text)
# Log processing time
performance_tracker.log_processing_time(start_time)
performance_tracker.log_claim_processed()
logger.info(f"Extracted claim: {extracted_claim}")
return extracted_claim
except Exception as e:
logger.error(f"Error extracting claims: {str(e)}")
# Fallback to original text on error
return text
def extract_with_openai(text):
    """
    Use the OpenAI model for claim extraction.
    """
    try:
        # Get LLM model
        llm_model = get_llm_model()

        # Create a very explicit prompt to avoid hallucination
        prompt = f"""
        Extract the main factual claim from the following text.
        DO NOT add any information not present in the original text.
        DO NOT add locations, dates, or other details.
        ONLY extract what is explicitly stated.

        Text: {text}

        Main factual claim:
        """

        # Call OpenAI with temperature=0 for deterministic output
        response = llm_model.invoke(prompt, temperature=0)
        extracted_claim = response.content.strip()

        # Strip a leading label (e.g. "Main factual claim:") if the model
        # echoes one; note this heuristic also truncates claims that
        # legitimately contain a colon
        if ":" in extracted_claim:
            parts = extracted_claim.split(":")
            if len(parts) > 1:
                extracted_claim = parts[-1].strip()

        logger.info(f"OpenAI extraction: {extracted_claim}")

        # Validate that we're not adding info not in the original
        nlp = get_nlp_model()
        extracted_claim = validate_extraction(text, extracted_claim, nlp)

        return extracted_claim
    except Exception as e:
        logger.error(f"Error in OpenAI claim extraction: {str(e)}")
        return text  # Fall back to the original text
def validate_extraction(original_text, extracted_claim, nlp):
    """
    Validate that the extracted claim doesn't add information not present
    in the original text.
    """
    # If extraction fails or is empty, return original
    if not extracted_claim or extracted_claim.strip() == "":
        logger.warning("Empty extraction result, using original text")
        return original_text

    # Check for added location information
    location_terms = ["united states", "america", "u.s.", "usa", "china", "india", "europe",
                      "russia", "japan", "uk", "germany", "france", "australia"]
    for term in location_terms:
        if term in extracted_claim.lower() and term not in original_text.lower():
            logger.warning(f"Extraction added location '{term}' not in original, using original text")
            return original_text

    # Check for entity preservation/addition using spaCy
    try:
        # Get entities from extracted text
        extracted_doc = nlp(extracted_claim)
        extracted_entities = [ent.text.lower() for ent in extracted_doc.ents]

        # Get entities from original text
        original_doc = nlp(original_text)
        original_entities = [ent.text.lower() for ent in original_doc.ents]

        # Flag entities in the extraction with no (even partial) match in the original
        for entity in extracted_entities:
            if not any(entity in orig_entity or orig_entity in entity for orig_entity in original_entities):
                logger.warning(f"Extraction added new entity '{entity}', using original text")
                return original_text

        return extracted_claim
    except Exception as e:
        logger.error(f"Error in extraction validation: {str(e)}")
        return original_text  # On error, it is safer to return the original
def shorten_claim_for_evidence(claim):
    """
    Shorten a claim for evidence retrieval by preserving important entities,
    verbs, and keywords while maintaining claim context.

    Args:
        claim (str): The original claim

    Returns:
        str: A shortened version of the claim optimized for evidence retrieval
    """
    try:
        # Normalize tense so matching and retrieval see a consistent form
        claim = normalize_tense(claim)

        # Get NLP model
        nlp = get_nlp_model()

        # Process claim with NLP
        doc = nlp(claim)

        # Components to extract
        important_components = []

        # 1. Extract all named entities as highest priority
        entities = [ent.text for ent in doc.ents]
        important_components.extend(entities)

        # 2. Extract key proper nouns if not already captured in entities
        for token in doc:
            if token.pos_ == "PROPN" and token.text not in important_components:
                important_components.append(token.text)

        # 3. Extract main verbs (actions)
        verbs = []
        for token in doc:
            if token.pos_ == "VERB" and not token.is_stop:
                verbs.append(token.text)

        # 4. Check for important title terms like "president", "prime minister"
        # (terms kept lowercase because the claim is lowercased for matching)
        title_terms = ["president", "prime minister", "minister", "chancellor", "premier",
                       "governor", "mayor", "senator", "ceo", "founder", "director"]
        for term in title_terms:
            if term in claim.lower():
                # Find the full phrase (e.g., "Canadian Prime Minister")
                matches = re.finditer(r'(?i)(?:\w+\s+)*\b' + re.escape(term) + r'\b(?:\s+\w+)*', claim)
                for match in matches:
                    phrase = match.group(0)
                    if phrase not in important_components:
                        important_components.append(phrase)

        # 5. Add important temporal indicators
        temporal_terms = ["today", "yesterday", "recently", "just", "now",
                          "current", "currently", "latest", "new", "week",
                          "month", "year", "announces", "announced", "introduces",
                          "introduced", "launches", "launched", "releases",
                          "released", "rolls out", "rolled out", "presents",
                          "presented", "unveils", "unveiled", "starts", "started",
                          "begins", "began", "initiates", "initiated", "anymore"]

        # Add significant temporal context (up to two words on either side)
        temporal_context = []
        for term in temporal_terms:
            if term in claim.lower():
                temporal_matches = re.finditer(r'(?i)(?:\w+\s+){0,2}\b' + re.escape(term) + r'\b(?:\s+\w+){0,2}', claim)
                for match in temporal_matches:
                    temporal_context.append(match.group(0))

        # 6. Always include negation words as they're critical for meaning
        negation_terms = ["not", "no longer", "former", "ex-", "isn't", "aren't", "doesn't", "don't"]
        negation_context = []
        for term in negation_terms:
            if term in claim.lower():
                # Capture the context around the negation (up to 3 words before and after)
                neg_matches = re.finditer(r'(?i)(?:\w+\s+){0,3}\b' + re.escape(term) + r'\b(?:\s+\w+){0,3}', claim)
                for match in neg_matches:
                    negation_context.append(match.group(0))

        # Combine all components
        all_components = important_components + verbs + temporal_context + negation_context

        # Remove duplicates while preserving order
        seen = set()
        unique_components = []
        for component in all_components:
            if component.lower() not in seen:
                seen.add(component.lower())
                unique_components.append(component)

        # If we have too few components (< 2), use the original claim
        if len(unique_components) < 2:
            # If the claim is already short (< 10 words), use it as-is
            if len(claim.split()) < 10:
                return claim
            # Otherwise, use the first 8 words
            words = claim.split()
            return " ".join(words[:min(8, len(words))])

        # Sort components by position in the claim so the shortened version
        # keeps the approximate original word order
        def get_position(comp):
            return claim.lower().find(comp.lower())

        unique_components.sort(key=get_position)
        shortened_claim = " ".join(unique_components)

        # If the shortened claim is still too long, limit it to the first 10 words
        if len(shortened_claim.split()) > 10:
            return " ".join(shortened_claim.split()[:10])

        return shortened_claim
    except Exception as e:
        logger.error(f"Error in shortening claim: {str(e)}")
        # Return original claim on error
        return claim
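
if __name__ == "__main__":
    # Minimal smoke-test sketch (assumption: the utils/ and modules/ packages
    # and their model loaders are available in this environment; the sample
    # sentences below are made up for illustration).
    logging.basicConfig(level=logging.INFO)

    sample_text = (
        "In a lengthy press briefing that covered many topics, officials "
        "confirmed that the new transit line will open next month after "
        "years of delays and budget overruns across several departments."
    )
    print(extract_claims(sample_text))
    print(shorten_claim_for_evidence(
        "The city announced the new transit line opens next month"))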