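"""Lightweight HTML scraping helpers built on curl_cffi and BeautifulSoup

Most functions return either the extracted data or a dict with an "error"
key on failure, so callers can branch on the result type.
"""
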
from curl_cffi import requests as req
from bs4 import BeautifulSoup
import logging
from typing import Union, List, Dict
from urllib.parse import urljoin, urlparse

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ScrapingError(Exception):
    """Custom exception for scraping errors"""
    pass

def validate_url(url: str) -> bool:
    """Validate if the given URL is properly formatted"""
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except Exception:
        return False

def clean_url(url: str) -> str:
    """Clean and normalize URL"""
    if url.startswith('//'):
        return f'https:{url}'
    return url

def scrape_html(url: str) -> Union[str, Dict[str, str]]:
    """
    Fetch HTML content from a URL, validating the URL and response content type
    
    Args:
        url (str): The URL to scrape
        
    Returns:
        str: HTML content if successful
        dict: Error information if failed
    """
    try:
        if not validate_url(url):
            return {"error": "Invalid URL format"}

        response = req.get(
            url, 
            impersonate='chrome110',
            timeout=30,
            max_redirects=5
        )
        
        # Reject HTTP error responses before inspecting the body
        if response.status_code >= 400:
            return {"error": f"HTTP error {response.status_code}"}

        # Check that the response is HTML
        content_type = response.headers.get('content-type', '').lower()
        if 'text/html' not in content_type:
            return {"error": f"Unexpected content type: {content_type}"}

        return response.text
    
    except Exception as e:
        logger.error(f"Unexpected error while scraping {url}: {str(e)}")
        return {"error": f"Unexpected error: {str(e)}"}

def scrape_images(data: str, filter: str = "") -> Union[List[str], Dict[str, str]]:
    """
    Extract image URLs from <img> tags and inline background-image styles
    
    Args:
        data (str): HTML content
        filter (str): Optional filter string for URLs
        
    Returns:
        list: List of image URLs if successful
        dict: Error information if failed
    """
    try:
        if not data:
            return {"error": "No HTML content provided"}

        soup = BeautifulSoup(data, 'html.parser')
        images = []
        
        # Collect sources from <img> tags, including lazy-loaded data-src values
        for img in soup.find_all('img'):
            src = img.get('src') or img.get('data-src')
            if src:
                src = clean_url(src)
                if validate_url(src) and (not filter or filter.lower() in src.lower()):
                    images.append(src)

        # Look for background images in style attributes
        for elem in soup.find_all(style=True):
            style = elem['style']
            if 'background-image' in style and 'url(' in style:
                url_start = style.find('url(') + 4
                url_end = style.find(')', url_start)
                if url_end != -1:
                    src = style[url_start:url_end].strip(' "\'')
                    src = clean_url(src)
                    if validate_url(src) and (not filter or filter.lower() in src.lower()):
                        images.append(src)

        return list(set(images))  # Remove duplicates

    except Exception as e:
        logger.error(f"Error extracting images: {str(e)}")
        return {"error": f"Failed to extract images: {str(e)}"}

def scrape_links(url: str, filter: str = "") -> Union[List[str], Dict[str, str]]:
    """
    Fetch a page and extract absolute links from its anchor tags
    
    Args:
        url (str): URL to scrape
        filter (str): Optional filter for links
        
    Returns:
        list: List of links if successful
        dict: Error information if failed
    """
    try:
        if not validate_url(url):
            return {"error": "Invalid URL format"}


        logger.info(f"Scraping links from {url}")
        response = req.get(
            url,
            impersonate='chrome110',
            timeout=30,
            max_redirects=5
        )
        
        soup = BeautifulSoup(response.text, 'html.parser')
        links = []
        base_url = url

        try:

            for a in soup.find_all('a', href=True):
                href = a['href']
                # Convert relative URLs to absolute
                full_url = urljoin(base_url, href)
                
                if validate_url(full_url) and (not filter or filter.lower() in full_url.lower()):
                    links.append(full_url)

            return list(set(links))  # Remove duplicates
        
        except Exception as e:
            logger.error(f"Error processing links: {str(e)}")
            return {"error": f"Failed to process links: {str(e)}"}

    except Exception as e:
        logger.error(f"Error extracting links: {str(e)}")
        return {"error": f"Failed to extract links: {str(e)}"}

def scrape_text(data: str) -> Union[str, Dict[str, str]]:
    """
    Extract clean text content from HTML
    
    Args:
        data (str): HTML content
        
    Returns:
        str: Extracted text if successful
        dict: Error information if failed
    """
    try:
        if not data:
            return {"error": "No HTML content provided"}

        soup = BeautifulSoup(data, 'html.parser')
        
        # Remove script and style elements
        for element in soup(['script', 'style', 'head']):
            element.decompose()
        
        # Get text and clean it
        text = soup.get_text(separator='\n')
        # Remove excessive newlines and whitespace
        text = '\n'.join(line.strip() for line in text.split('\n') if line.strip())
        
        return text

    except Exception as e:
        logger.error(f"Error extracting text: {str(e)}")
        return {"error": f"Failed to extract text: {str(e)}"}

def scrape_div(data: str, div: str) -> Union[List[Dict[str, str]], Dict[str, str]]:
    """
    Extract content from specific div elements
    
    Args:
        data (str): HTML content
        div (str): Class or ID of the div to scrape
        
    Returns:
        list: List of dicts with the "text" and "html" of each matching element
        dict: Error information if failed
    """
    try:
        if not data:
            return {"error": "No HTML content provided"}
        if not div:
            return {"error": "No div selector provided"}

        soup = BeautifulSoup(data, 'html.parser')
        results = []

        # Try class first
        elements = soup.find_all(class_=div)
        if not elements:
            # Try ID if no class found
            elements = soup.find_all(id=div)
            if not elements:
                return {"error": f"No elements found with class or ID: {div}"}

        for element in elements:
            # Get both text and HTML content
            content = {
                "text": element.get_text(strip=True),
                "html": str(element)
            }
            results.append(content)

        return results

    except Exception as e:
        logger.error(f"Error extracting div content: {str(e)}")
        return {"error": f"Failed to extract div content: {str(e)}"}

def scrape_metadata(data: str) -> Dict[str, str]:
    """
    Extract <meta> tag name/property and content pairs from HTML

    Args:
        data (str): HTML content

    Returns:
        dict: Mapping of each meta name or property to its content value
    """
    soup = BeautifulSoup(data, 'html.parser')
    metadata = {}
    for meta in soup.find_all('meta'):
        name = meta.get('name') or meta.get('property')
        content = meta.get('content')
        if name and content:
            metadata[name] = content
    return metadata

def scrape_tables(data: str) -> List[List[List[str]]]:
    """
    Extract table data from HTML as lists of rows of cell text

    Args:
        data (str): HTML content

    Returns:
        list: One list per <table>, each a list of rows of cell strings
    """
    soup = BeautifulSoup(data, 'html.parser')
    tables = []
    for table in soup.find_all('table'):
        rows = []
        for row in table.find_all('tr'):
            cells = [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]
            rows.append(cells)
        tables.append(rows)
    return tables
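

# Minimal usage sketch (assumes network access; https://example.com is a
# placeholder URL, not part of this module):
if __name__ == "__main__":
    page = scrape_html("https://example.com")
    if isinstance(page, dict):
        logger.error(page["error"])
    else:
        print("Links:", scrape_links("https://example.com"))
        print("Images:", scrape_images(page))
        print("Metadata:", scrape_metadata(page))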