import logging
import re
from typing import Dict, Optional
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup, NavigableString

from ..utils.logging_config import setup_logging


class ArticleScraper:
    def __init__(self):
        self.session = requests.Session()
        # update() keeps the session's default headers instead of replacing
        # them wholesale.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        setup_logging()
        self.logger = logging.getLogger(__name__)

    def _get_domain(self, url: str) -> str:
        """Extract domain from URL."""
        return urlparse(url).netloc

    def _fetch_page(self, url: str) -> Optional[str]:
        """Fetch page content with error handling."""
        try:
            # requests applies no timeout by default; set one so a stalled
            # server cannot hang the scraper indefinitely.
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            self.logger.error(f"Error fetching {url}: {e}")
            return None

    def _process_element(self, element) -> str:
        """Process an HTML element while preserving structure and formatting."""
        # Plain text nodes are returned as-is.
        if isinstance(element, NavigableString):
            return str(element)

        tag_name = element.name
        if tag_name in ['p', 'div']:
            return '\n\n' + ''.join(self._process_element(child) for child in element.children).strip()
        elif tag_name in ['ul', 'ol']:
            items = []
            for li in element.find_all('li', recursive=False):
                prefix = '• ' if tag_name == 'ul' else f"{len(items) + 1}. "
                items.append(prefix + ''.join(self._process_element(child) for child in li.children).strip())
            return '\n' + '\n'.join(items) + '\n'
        elif tag_name == 'br':
            return '\n'
        elif tag_name in ['strong', 'b']:
            return '**' + ''.join(self._process_element(child) for child in element.children) + '**'
        elif tag_name in ['em', 'i']:
            return '_' + ''.join(self._process_element(child) for child in element.children) + '_'
        elif tag_name == 'a':
            text = ''.join(self._process_element(child) for child in element.children)
            href = element.get('href', '')
            return f'[{text}]({href})'
        elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            level = int(tag_name[1])
            prefix = '#' * (level + 1)  # Add one more # so the article headline stays the only top-level heading
            return f'\n\n{prefix} ' + ''.join(self._process_element(child) for child in element.children).strip() + '\n'

        # Unhandled tags: fall through to the concatenated text of their children.
        return ''.join(self._process_element(child) for child in element.children)
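
    # Illustrative mapping produced by _process_element (the example fragments
    # are assumed for documentation, not taken from the original source):
    #   <p>Hello <strong>world</strong></p>  ->  "\n\nHello **world**"
    #   <h2>Topic</h2>                       ->  "\n\n### Topic\n"
    #   <ul><li>a</li><li>b</li></ul>        ->  "\n• a\n• b\n"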

    def _extract_content(self, container) -> str:
        """Extract and format content from a container element."""
        if not container:
            return ''

        # Drop elements that never contribute to article text.
        for unwanted in container.find_all(['script', 'style', 'iframe', 'aside']):
            unwanted.decompose()

        content = self._process_element(container)
        # Trim whitespace per line, then collapse runs of blank lines to a
        # single blank line so paragraph breaks survive instead of being
        # filtered away entirely.
        content = '\n'.join(line.strip() for line in content.split('\n'))
        content = re.sub(r'\n{3,}', '\n\n', content)
        return content.strip()

    def _extract_politifact(self, soup: BeautifulSoup) -> Dict[str, str]:
        """Extract content from PolitiFact articles."""
        try:
            headline = soup.find('h1', class_='article__title') or soup.find('h1')
            headline = headline.get_text(strip=True) if headline else "No headline found"
            self.logger.info(f"Found headline: {headline}")

            content_div = soup.find('article', class_='article') or soup.select_one('.article__text, .m-textblock')
            content = self._extract_content(content_div) if content_div else "No content found"
            return {"headline": headline, "content": content}
        except Exception as e:
            self.logger.error(f"Error extracting PolitiFact content: {e}")
            return {"headline": "Error", "content": f"Failed to extract content: {e}"}

    def _extract_generic(self, soup: BeautifulSoup, domain: str) -> Dict[str, str]:
        """Fallback extraction method for unknown domains."""
        headline = soup.find('h1')
        headline_text = headline.get_text().strip() if headline else "No headline found"

        content_div = None
        common_selectors = ['article', 'main', '.content', '.article-content']
        for selector in common_selectors:
            content_div = soup.select_one(selector)
            if content_div:
                break

        content = self._extract_content(content_div) if content_div else "No content found"
        return {"headline": headline_text, "content": content}

    def scrape_article(self, url: str) -> Optional[Dict[str, str]]:
        """
        Main entry point: scrape an article while maintaining its structure.
        Returns a dictionary with headline and content, or None on failure.
        """
        html_content = self._fetch_page(url)
        if not html_content:
            self.logger.error("Failed to fetch page content")
            return None

        soup = BeautifulSoup(html_content, 'html.parser')
        domain = self._get_domain(url)
        self.logger.info(f"Scraping article from domain: {domain}")

        # Use the site-specific extractor when one exists; otherwise fall
        # back to the generic heuristics.
        if 'politifact.com' in domain:
            return self._extract_politifact(soup)
        return self._extract_generic(soup, domain)
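

# Minimal usage sketch (an assumption added for illustration; not part of the
# original module). The relative import at the top means this file belongs to
# a package, so run it as e.g. `python -m <package>.<this_module>`; the URL
# below is hypothetical.
if __name__ == '__main__':
    scraper = ArticleScraper()
    article = scraper.scrape_article('https://www.politifact.com/factchecks/example/')
    if article:
        print(article['headline'])
        print()
        print(article['content'])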