import logging
import re
from typing import Dict, Optional
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup, NavigableString

from ..utils.logging_config import setup_logging

class ArticleScraper:
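    """Scrape news articles into Markdown-flavoured plain text.

    PolitiFact pages use a site-specific extractor; other domains fall back
    to a generic heuristic over common content-container selectors.
    """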
    def __init__(self):
        self.session = requests.Session()
        # update() preserves the session's default headers (Accept-Encoding,
        # Connection, etc.) instead of replacing them wholesale.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        setup_logging()
        self.logger = logging.getLogger(__name__)

    def _get_domain(self, url: str) -> str:
        """Extract domain from URL."""
        return urlparse(url).netloc

    def _fetch_page(self, url: str) -> Optional[str]:
        """Fetch page content with error handling."""
        try:
            # A timeout prevents a slow or unresponsive server from hanging
            # the scraper indefinitely.
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            self.logger.error(f"Error fetching {url}: {e}")
            return None

    def _process_element(self, element) -> str:
        """Process an HTML element while preserving structure and formatting."""
        if isinstance(element, NavigableString):
            return str(element)

        tag_name = element.name
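        # Recurse through children, emitting Markdown-style markers for block
        # elements (paragraphs, lists, headings) and inline ones (bold,
        # italic, links); unknown tags pass their children through unchanged.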
        
        if tag_name in ['p', 'div']:
            return '\n\n' + ''.join(self._process_element(child) for child in element.children).strip()
        
        elif tag_name in ['ul', 'ol']:
            items = []
            for li in element.find_all('li', recursive=False):
                prefix = '• ' if tag_name == 'ul' else f"{len(items) + 1}. "
                items.append(prefix + ''.join(self._process_element(child) for child in li.children).strip())
            return '\n' + '\n'.join(items) + '\n'
        
        elif tag_name == 'br':
            return '\n'
        
        elif tag_name in ['strong', 'b']:
            return '**' + ''.join(self._process_element(child) for child in element.children) + '**'
        
        elif tag_name in ['em', 'i']:
            return '_' + ''.join(self._process_element(child) for child in element.children) + '_'
        
        elif tag_name == 'a':
            text = ''.join(self._process_element(child) for child in element.children)
            href = element.get('href', '')
            return f'[{text}]({href})'
        
        elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            level = int(tag_name[1])
            prefix = '#' * (level + 1)  # Demote one level so the article headline can remain the only H1
            return f'\n\n{prefix} ' + ''.join(self._process_element(child) for child in element.children).strip() + '\n'
        
        return ''.join(self._process_element(child) for child in element.children)

    def _extract_content(self, container) -> str:
        """Extract and format content from a container element."""
        if not container:
            return ''
        
        for unwanted in container.find_all(['script', 'style', 'iframe', 'aside']):
            unwanted.decompose()
        
        content = self._process_element(container)
        
        # Strip per-line whitespace, then collapse runs of blank lines so
        # paragraph breaks survive as single blank lines.
        content = '\n'.join(line.strip() for line in content.split('\n'))
        content = re.sub(r'\n{3,}', '\n\n', content)
        
        return content.strip()

    def _extract_politifact(self, soup: BeautifulSoup) -> Dict[str, str]:
        """Extract content from PolitiFact articles."""
        try:
            headline = soup.find('h1', class_='article__title') or soup.find('h1')
            headline = headline.get_text(strip=True) if headline else "No headline found"
            
            self.logger.info(f"Found headline: {headline}")
            
            content_div = soup.find('article', class_='article') or soup.select_one('.article__text, .m-textblock')
            content = self._extract_content(content_div) if content_div else "No content found"
            
            return {"headline": headline, "content": content}
        
        except Exception as e:
            self.logger.error(f"Error extracting PolitiFact content: {str(e)}")
            return {"headline": "Error", "content": f"Failed to extract content: {str(e)}"}

    def _extract_generic(self, soup: BeautifulSoup, domain: str) -> Dict[str, str]:
        """Fallback extraction method for unknown domains."""
        headline = soup.find('h1')
        headline_text = headline.get_text().strip() if headline else "No headline found"
        
        content_div = None
        common_selectors = ['article', 'main', '.content', '.article-content']
        
        for selector in common_selectors:
            content_div = soup.select_one(selector)
            if content_div:
                break
        
        content = self._extract_content(content_div) if content_div else "No content found"
        
        return {"headline": headline_text, "content": content}

    def scrape_article(self, url: str) -> Optional[Dict[str, str]]:
        """
        Main function to scrape articles while maintaining structure.
        Returns a dictionary with headline and content.
        """
        html_content = self._fetch_page(url)
        if not html_content:
            self.logger.error("Failed to fetch page content")
            return None

        soup = BeautifulSoup(html_content, 'html.parser')
        domain = self._get_domain(url)
        
        self.logger.info(f"Scraping article from domain: {domain}")
        
        if 'politifact.com' in domain:
            return self._extract_politifact(soup)
        
        return self._extract_generic(soup, domain)
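

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. The URL below is
    # a placeholder; because of the relative import above, run this with
    # `python -m <package>.scraper` rather than as a standalone script.
    scraper = ArticleScraper()
    article = scraper.scrape_article("https://www.politifact.com/factchecks/example/")
    if article:
        print(article["headline"])
        print()
        print(article["content"])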