# Commit 55cdb25: updating analyzers to return flagged_phrases list for each.
import logging
from typing import Dict, Optional
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup, NavigableString

from ..utils.logging_config import setup_logging

class ArticleScraper:
    def __init__(self):
        self.session = requests.Session()
        # update() keeps requests' default headers (e.g. Accept-Encoding)
        # while overriding the User-Agent.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        setup_logging()
        self.logger = logging.getLogger(__name__)

    def _get_domain(self, url: str) -> str:
        """Extract the domain from a URL."""
        return urlparse(url).netloc
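    # e.g. urlparse("https://www.politifact.com/factchecks/").netloc
    #      -> "www.politifact.com"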

    def _fetch_page(self, url: str) -> Optional[str]:
        """Fetch page content with error handling."""
        try:
            # A timeout keeps a stalled server from hanging the scraper indefinitely.
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except Exception as e:
            self.logger.error(f"Error fetching {url}: {str(e)}")
            return None

    def _process_element(self, element) -> str:
        """Process an HTML element while preserving structure and formatting."""
        if isinstance(element, NavigableString):
            return str(element)
        tag_name = element.name
        if tag_name in ['p', 'div']:
            return '\n\n' + ''.join(self._process_element(child) for child in element.children).strip()
        elif tag_name in ['ul', 'ol']:
            items = []
            for li in element.find_all('li', recursive=False):
                prefix = '• ' if tag_name == 'ul' else f"{len(items) + 1}. "
                items.append(prefix + ''.join(self._process_element(child) for child in li.children).strip())
            return '\n' + '\n'.join(items) + '\n'
        elif tag_name == 'br':
            return '\n'
        elif tag_name in ['strong', 'b']:
            return '**' + ''.join(self._process_element(child) for child in element.children) + '**'
        elif tag_name in ['em', 'i']:
            return '_' + ''.join(self._process_element(child) for child in element.children) + '_'
        elif tag_name == 'a':
            text = ''.join(self._process_element(child) for child in element.children)
            href = element.get('href', '')
            return f'[{text}]({href})'
        elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            level = int(tag_name[1])
            prefix = '#' * (level + 1)  # Demote headings one level (h1 -> ##, h2 -> ###)
            return f'\n\n{prefix} ' + ''.join(self._process_element(child) for child in element.children).strip() + '\n'
        return ''.join(self._process_element(child) for child in element.children)
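
    # Illustrative conversions performed by _process_element (assumed inputs,
    # shown for reference only):
    #   <p>Hello <strong>world</strong></p>  ->  "\n\nHello **world**"
    #   <ul><li>a</li><li>b</li></ul>        ->  "\n• a\n• b\n"
    #   <h2>Section</h2>                     ->  "\n\n### Section\n"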

    def _extract_content(self, container) -> str:
        """Extract and format content from a container element."""
        if not container:
            return ''
        for unwanted in container.find_all(['script', 'style', 'iframe', 'aside']):
            unwanted.decompose()
        content = self._process_element(container)
        content = '\n'.join(line.strip() for line in content.split('\n'))
        content = '\n'.join(filter(None, content.split('\n')))
        return content.strip()
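
    # The two join passes above first strip every line, then drop the lines
    # left empty, e.g. "  intro \n\n   body  " collapses to "intro\nbody".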

    def _extract_politifact(self, soup: BeautifulSoup) -> Dict[str, str]:
        """Extract content from PolitiFact articles."""
        try:
            headline = soup.find('h1', class_='article__title') or soup.find('h1')
            headline = headline.get_text(strip=True) if headline else "No headline found"
            self.logger.info(f"Found headline: {headline}")
            content_div = soup.find('article', class_='article') or soup.select_one('.article__text, .m-textblock')
            content = self._extract_content(content_div) if content_div else "No content found"
            return {"headline": headline, "content": content}
        except Exception as e:
            self.logger.error(f"Error extracting PolitiFact content: {str(e)}")
            return {"headline": "Error", "content": f"Failed to extract content: {str(e)}"}

    def _extract_generic(self, soup: BeautifulSoup, domain: str) -> Dict[str, str]:
        """Fallback extraction method for unknown domains."""
        headline = soup.find('h1')
        headline_text = headline.get_text().strip() if headline else "No headline found"
        content_div = None
        common_selectors = ['article', 'main', '.content', '.article-content']
        for selector in common_selectors:
            content_div = soup.select_one(selector)
            if content_div:
                break
        content = self._extract_content(content_div) if content_div else "No content found"
        return {"headline": headline_text, "content": content}

    def scrape_article(self, url: str) -> Optional[Dict[str, str]]:
        """
        Scrape an article while preserving its structure.
        Returns a dict with headline and content, or None if the page
        could not be fetched.
        """
        html_content = self._fetch_page(url)
        if not html_content:
            self.logger.error("Failed to fetch page content")
            return None
        soup = BeautifulSoup(html_content, 'html.parser')
        domain = self._get_domain(url)
        self.logger.info(f"Scraping article from domain: {domain}")
        # Route to a site-specific extractor when one exists, else fall back.
        if 'politifact.com' in domain:
            return self._extract_politifact(soup)
        return self._extract_generic(soup, domain)
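

# Minimal usage sketch (illustrative only; the relative import at the top
# means this module is imported from its package rather than run as a script,
# and the import path below is an assumption, not from the source):
#
#     from app.scrapers.article_scraper import ArticleScraper
#
#     scraper = ArticleScraper()
#     result = scraper.scrape_article("https://www.politifact.com/factchecks/...")
#     if result:
#         print(result["headline"])
#         print(result["content"][:200])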