import feedparser
import time
import logging
import re
import ssl
import random
import requests
from datetime import datetime, timedelta
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed

logger = logging.getLogger("misinformation_detector")

# Disable SSL certificate verification for feeds with self-signed certs
ssl._create_default_https_context = ssl._create_unverified_context

# List of RSS feeds to check for news
# These are popular news sources with reliable and frequently updated RSS feeds
RSS_FEEDS = [
    # --------------------
    # General World News
    # --------------------
    "http://rss.cnn.com/rss/cnn_world.rss",                           # CNN World News
    "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",         # NYT World News
    "https://feeds.washingtonpost.com/rss/world",                     # The Washington Post World News
    "https://feeds.bbci.co.uk/news/world/rss.xml",                    # BBC News - World

    # --------------------
    # Tech & Startup News (Global)
    # --------------------
    "https://techcrunch.com/feed/",                                   # TechCrunch - Startup and Technology News
    "https://venturebeat.com/feed/",                                  # VentureBeat - Tech News
    "https://www.wired.com/feed/rss",                                 # Wired - Technology News
    "https://www.cnet.com/rss/news/",                                 # CNET - Technology News
    "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=t&hl=en-IN",  # Google News India - Technology
    "https://news.google.com/rss?gl=US&ceid=US:en&topic=t&hl=en-US",  # Google News US - Technology

    # --------------------
    # Startup & VC Focused
    # --------------------
    "https://news.crunchbase.com/feed/",                              # Crunchbase News - Startup Funding
    "https://techstartups.com/feed/",                                 # Tech Startups - Startup News

    # --------------------
    # Global Business & Corporate Feeds
    # --------------------
    "https://feeds.bloomberg.com/technology/news.rss",                # Bloomberg Technology News
    "https://www.ft.com/technology?format=rss",                       # Financial Times Technology News
    "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=b&hl=en-IN",  # Google News India - Business

    # --------------------
    # India-specific News
    # --------------------
    "https://inc42.com/feed/",                                        # Inc42 - Indian Startups and Technology
    "https://timesofindia.indiatimes.com/rssfeedstopstories.cms",     # TOI - Top Stories
    "https://timesofindia.indiatimes.com/rssfeedmostrecent.cms",      # TOI - Most Recent Stories
    "https://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms",   # TOI - India News
    "https://timesofindia.indiatimes.com/rssfeeds/296589292.cms",     # TOI - World News
    "https://timesofindia.indiatimes.com/rssfeeds/1898055.cms",       # TOI - Business News
    "https://timesofindia.indiatimes.com/rssfeeds/54829575.cms",      # TOI - Cricket News
    "https://timesofindia.indiatimes.com/rssfeeds/4719148.cms",       # TOI - Sports News
    "https://timesofindia.indiatimes.com/rssfeeds/-2128672765.cms",   # TOI - Science News

    # --------------------
    # Sports News (Global + Cricket)
    # --------------------
    "https://www.espn.com/espn/rss/news",                             # ESPN - Top Sports News
    "https://feeds.skynews.com/feeds/rss/sports.xml",                 # Sky News - Sports
    "https://sports.ndtv.com/rss/all",                                # NDTV Sports
    "https://www.espncricinfo.com/rss/content/story/feeds/0.xml",     # ESPN Cricinfo - Cricket News

    # --------------------
    # Fact-Checking Sources
    # --------------------
    "https://www.snopes.com/feed/",                                   # Snopes - Fact Checking
    "https://www.politifact.com/rss/all/",                            # PolitiFact - Fact Checking
    "https://www.factcheck.org/feed/",                                # FactCheck - Fact Checking
    "https://leadstories.com/atom.xml",                               # Lead Stories - Fact Checking
    "https://fullfact.org/feed/all/",                                 # Full Fact - Fact Checking
    "https://www.truthorfiction.com/feed/",                           # TruthOrFiction - Fact Checking

    # --------------------
    # Politics & Policy (General)
    # --------------------
    "https://feeds.bbci.co.uk/news/politics/rss.xml",                 # BBC News - Politics
    "https://feeds.bbci.co.uk/news/science_and_environment/rss.xml",  # BBC - Science & Environment

    # --------------------
    # Science
    # --------------------
    "https://www.nature.com/nature.rss",                              # Nature - Science News
    "https://feeds.science.org/rss/science-advances.xml"              # Science Advances - science.org
]

def clean_html(raw_html):
    """Remove HTML tags from text"""
    if not raw_html:
        return ""
    clean_regex = re.compile('<.*?>')
    clean_text = re.sub(clean_regex, '', raw_html)
    # Remove extra whitespace
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()
    return clean_text
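
# Illustrative behaviour of clean_html (hypothetical input, not used elsewhere in this module):
#   clean_html("<p>Breaking: <b>markets   rally</b></p>")  ->  "Breaking: markets rally"
# Only tags are stripped and whitespace collapsed; HTML entities such as &amp; are left as-is.
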
def parse_feed(feed_url, timeout=5):
    """
    Parse a single RSS feed with proper timeout handling.
    Uses requests with a timeout first, then passes the content to feedparser.
    """
    try:
        # Use requests with a timeout to fetch the RSS content
        response = requests.get(feed_url, timeout=timeout)
        response.raise_for_status()

        # Then parse the content with feedparser (which doesn't support timeouts)
        feed = feedparser.parse(response.content)

        # Basic validation of the feed
        if hasattr(feed, 'entries') and feed.entries:
            return feed
        else:
            logger.warning(f"Feed {feed_url} parsed but contains no entries")
            return None
    except requests.exceptions.Timeout:
        logger.warning(f"Timeout while fetching feed {feed_url}")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"Request error fetching feed {feed_url}: {str(e)}")
        return None
    except Exception as e:
        logger.error(f"Error parsing feed {feed_url}: {str(e)}")
        return None

def fetch_all_feeds(feeds_list=None, max_workers=5, timeout=5):
    """
    Fetch multiple RSS feeds in parallel with proper timeout handling.
    Returns a list of (domain, feed) tuples for successfully fetched feeds.
    """
    # Use the default RSS_FEEDS list if none is provided
    if feeds_list is None:
        feeds_list = RSS_FEEDS

    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(parse_feed, url, timeout): url for url in feeds_list}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                feed = future.result()
                if feed and hasattr(feed, 'entries') and feed.entries:
                    # Extract domain for source attribution
                    domain = urlparse(url).netloc
                    results.append((domain, feed))
                    logger.info(f"Successfully fetched {domain} with {len(feed.entries)} entries")
            except Exception as e:
                logger.error(f"Error processing {url}: {str(e)}")
    return results

def extract_date(entry):
    """Extract and normalize the publication date from a feed entry"""
    # Try the pre-parsed time tuples first
    for date_field in ['published_parsed', 'updated_parsed', 'created_parsed']:
        if hasattr(entry, date_field) and getattr(entry, date_field):
            try:
                # Convert time tuple to datetime
                time_tuple = getattr(entry, date_field)
                return datetime(time_tuple[0], time_tuple[1], time_tuple[2],
                                time_tuple[3], time_tuple[4], time_tuple[5])
            except Exception as e:
                logger.debug(f"Error parsing {date_field}: {e}")
                continue

    # Fall back to string dates
    for date_field in ['published', 'updated', 'pubDate']:
        if hasattr(entry, date_field) and getattr(entry, date_field):
            try:
                date_str = getattr(entry, date_field)
                # Try various common formats
                for fmt in ['%a, %d %b %Y %H:%M:%S %z', '%a, %d %b %Y %H:%M:%S %Z',
                            '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z']:
                    try:
                        return datetime.strptime(date_str, fmt)
                    except ValueError:
                        continue
            except Exception as e:
                logger.debug(f"Error parsing date string {date_field}: {e}")
                continue

    # Default to the current time if parsing fails
    return datetime.now()

def is_recent(entry_date, claim=None, max_days=3):
    """
    Check if an entry is recent based on temporal indicators in the claim.

    Args:
        entry_date (datetime): The date of the entry to check
        claim (str, optional): The claim text to analyze for temporal indicators
        max_days (int, optional): Default maximum age in days

    Returns:
        bool: True if the entry is considered recent, False otherwise
    """
    if not entry_date:
        return False

    # Default max days if no claim is provided
    default_days = max_days
    extended_days = 15  # For 'recently', 'this week', etc.

    if claim:
        # Specific day indicators get the default window
        specific_day_terms = ["today", "yesterday", "day before yesterday"]

        # Extended time terms get the extended window
        extended_time_terms = [
            "recently", "currently", "freshly", "this week", "few days",
            "couple of days", "last week", "past week", "several days",
            "anymore"
        ]

        claim_lower = claim.lower()

        # Check for extended time terms first, then specific day terms
        if any(term in claim_lower for term in extended_time_terms):
            cutoff = datetime.now() - timedelta(days=extended_days)
            return entry_date > cutoff
        elif any(term in claim_lower for term in specific_day_terms):
            cutoff = datetime.now() - timedelta(days=default_days)
            return entry_date > cutoff

    # Default case - use the standard window
    cutoff = datetime.now() - timedelta(days=default_days)
    return entry_date > cutoff

def get_entry_relevance(entry, query_terms, domain):
    """Calculate a relevance score for an entry based on query match and recency"""
    if not hasattr(entry, 'title') or not entry.title:
        return 0

    # Extract text content
    title = entry.title or ""
    description = clean_html(entry.description) if hasattr(entry, 'description') else ""
    content = ""
    if hasattr(entry, 'content'):
        for content_item in entry.content:
            if 'value' in content_item:
                content += clean_html(content_item['value']) + " "

    # Extract published date
    pub_date = extract_date(entry)

    # Calculate recency score (0-1)
    recency_score = 0
    if pub_date:
        days_old = (datetime.now() - pub_date).days
        if days_old <= 1:  # Today or yesterday
            recency_score = 1.0
        elif days_old <= 2:
            recency_score = 0.8
        elif days_old <= 3:
            recency_score = 0.5
        else:
            recency_score = 0.2

    # Calculate relevance score based on keyword matches
    text = f"{title} {description} {content}".lower()

    # Count how many query terms appear in the content
    query_terms_lower = [term.lower() for term in query_terms]
    matches = sum(1 for term in query_terms_lower if term in text)

    # Calculate match score (0-1)
    match_score = min(1.0, matches / max(1, len(query_terms) * 0.7))

    # Boost score for exact phrase matches
    query_phrase = " ".join(query_terms_lower)
    if query_phrase in text:
        match_score += 0.5

    # Additional boost for title matches (they're more relevant)
    title_matches = sum(1 for term in query_terms_lower if term in title.lower())
    if title_matches > 0:
        match_score += 0.2 * (title_matches / len(query_terms_lower))

    # Source quality factor (can be adjusted based on source reliability)
    source_factor = 1.0
    high_quality_domains = ['bbc.co.uk', 'nytimes.com', 'reuters.com', 'washingtonpost.com',
                            'espncricinfo.com', 'cricbuzz.com', 'snopes.com']
    if any(quality_domain in domain for quality_domain in high_quality_domains):
        source_factor = 1.2

    # Calculate final score
    final_score = (match_score * 0.6) + (recency_score * 0.4) * source_factor
    return min(1.0, final_score)  # Cap at 1.0
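
# Worked example of the scoring above (hypothetical entry, shown for illustration only):
# query_terms = ["india", "wins", "cricket", "match"], title "India wins cricket match against Australia",
# published today, domain "www.espncricinfo.com".
#   recency_score = 1.0 (0 days old)
#   match_score   = min(1.0, 4 / 2.8) = 1.0, +0.5 phrase boost, +0.2 title boost = 1.7
#   source_factor = 1.2 (espncricinfo.com)
#   final_score   = (1.7 * 0.6) + (1.0 * 0.4) * 1.2 = 1.50, capped to 1.0
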
def retrieve_evidence_from_rss(claim, max_results=10, category_feeds=None):
    """
    Retrieve evidence from RSS feeds for a given claim.

    Args:
        claim (str): The claim to verify
        max_results (int): Maximum number of results to return
        category_feeds (list, optional): List of category-specific RSS feeds to check

    Returns:
        list: List of relevant evidence items
    """
    start_time = time.time()
    logger.info(f"Retrieving evidence from RSS feeds for: {claim}")

    # Extract key terms from the claim
    terms = [term.strip() for term in re.findall(r'\b\w+\b', claim) if len(term.strip()) > 2]

    try:
        # Use category-specific feeds if provided
        feeds_to_use = category_feeds if category_feeds else RSS_FEEDS

        # Log which feeds we're using
        if category_feeds:
            logger.info(f"Using {len(category_feeds)} category-specific RSS feeds")
        else:
            logger.info(f"Using {len(RSS_FEEDS)} default RSS feeds")

        # Limit the number of feeds to process for efficiency
        if len(feeds_to_use) > 10:
            # If we have too many feeds, select a subset,
            # prioritizing fact-checking sources
            fact_check_feeds = [feed for feed in feeds_to_use
                                if "fact" in feed.lower() or "snopes" in feed.lower() or "politifact" in feed.lower()]
            other_feeds = [feed for feed in feeds_to_use if feed not in fact_check_feeds]

            # Take all fact-checking feeds plus a random selection of others
            selected_feeds = fact_check_feeds + random.sample(other_feeds, min(10 - len(fact_check_feeds), len(other_feeds)))
        else:
            selected_feeds = feeds_to_use

        # Fetch the selected feeds in parallel
        feeds = fetch_all_feeds(selected_feeds)

        if not feeds:
            logger.warning("No RSS feeds could be fetched")
            return []

        all_entries = []

        # Process all feed entries
        for domain, feed in feeds:
            for entry in feed.entries:
                # Calculate relevance score
                relevance = get_entry_relevance(entry, terms, domain)

                if relevance > 0.3:  # Only consider somewhat relevant entries
                    # Extract entry details
                    title = entry.title if hasattr(entry, 'title') else "No title"
                    link = entry.link if hasattr(entry, 'link') else ""

                    # Extract and clean description/content
                    description = ""
                    if hasattr(entry, 'description'):
                        description = clean_html(entry.description)
                    elif hasattr(entry, 'summary'):
                        description = clean_html(entry.summary)
                    elif hasattr(entry, 'content'):
                        for content_item in entry.content:
                            if 'value' in content_item:
                                description += clean_html(content_item['value']) + " "

                    # Truncate description if too long
                    if len(description) > 1000:
                        description = description[:1000] + "..."

                    # Get publication date
                    pub_date = extract_date(entry)
                    date_str = pub_date.strftime('%Y-%m-%d') if pub_date else "Unknown date"

                    # Format as evidence text
                    evidence_text = (
                        f"Title: {title}, "
                        f"Source: {domain} (RSS), "
                        f"Date: {date_str}, "
                        f"URL: {link}, "
                        f"Content: {description}"
                    )

                    all_entries.append({
                        "text": evidence_text,
                        "relevance": relevance,
                        "date": pub_date or datetime.now()
                    })

        # Sort entries by relevance
        all_entries.sort(key=lambda x: x["relevance"], reverse=True)

        # Take the top results
        top_entries = all_entries[:max_results]

        logger.info(f"Retrieved {len(top_entries)} relevant RSS items from {len(feeds)} feeds in {time.time() - start_time:.2f}s")

        # Return just the text portion
        return [entry["text"] for entry in top_entries]

    except Exception as e:
        logger.error(f"Error in RSS retrieval: {str(e)}")
        return []
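
# Minimal smoke-test sketch (illustrative only; the claim below is a made-up example,
# and running this performs live network requests against the feeds listed above):
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    sample_claim = "India wins cricket match today"  # hypothetical claim for demonstration
    for item in retrieve_evidence_from_rss(sample_claim, max_results=5):
        print(item)
        print("-" * 80)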