from curl_cffi import requests as req
from bs4 import BeautifulSoup
import logging
from typing import Union, List, Dict, Optional
from urllib.parse import urljoin, urlparse
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class ScrapingError(Exception):
"""Custom exception for scraping errors"""
pass
def validate_url(url: str) -> bool:
"""Validate if the given URL is properly formatted"""
try:
result = urlparse(url)
return all([result.scheme, result.netloc])
except Exception:
return False
def clean_url(url: str) -> str:
"""Clean and normalize URL"""
if url.startswith('//'):
return f'https:{url}'
return url
def scrape_html(url: str) -> Union[str, Dict[str, str]]:
"""
Fetch HTML content from a URL with improved error handling
Args:
url (str): The URL to scrape
Returns:
str: HTML content if successful
dict: Error information if failed
"""
try:
if not validate_url(url):
return {"error": "Invalid URL format"}
response = req.get(
url,
impersonate='chrome110',
timeout=30,
max_redirects=5
)
# Check if response is HTML
content_type = response.headers.get('content-type', '').lower()
if 'text/html' not in content_type:
return {"error": f"Unexpected content type: {content_type}"}
return response.text
except Exception as e:
logger.error(f"Unexpected error while scraping {url}: {str(e)}")
return {"error": f"Unexpected error: {str(e)}"}
def scrape_images(data: str, filter: str = "") -> Union[List[str], Dict[str, str]]:
"""
Extract image URLs from HTML content with improved filtering and validation
Args:
data (str): HTML content
filter (str): Optional filter string for URLs
Returns:
list: List of image URLs if successful
dict: Error information if failed
"""
try:
if not data:
return {"error": "No HTML content provided"}
soup = BeautifulSoup(data, 'html.parser')
images = []
# Look for both img tags and background images in style attributes
for img in soup.find_all('img'):
src = img.get('src') or img.get('data-src')
if src:
src = clean_url(src)
if validate_url(src) and (not filter or filter.lower() in src.lower()):
images.append(src)
# Look for background images in style attributes
for elem in soup.find_all(style=True):
style = elem['style']
if 'background-image' in style:
                url_start = style.find('url(')
                if url_start != -1:
                    url_end = style.find(')', url_start + 4)
                    if url_end != -1:
                        src = style[url_start + 4:url_end].strip('"\'')
                        src = clean_url(src)
                        if validate_url(src) and (not filter or filter.lower() in src.lower()):
                            images.append(src)
return list(set(images)) # Remove duplicates
except Exception as e:
logger.error(f"Error extracting images: {str(e)}")
return {"error": f"Failed to extract images: {str(e)}"}
def scrape_links(url: str, filter: str = "") -> Union[List[str], Dict[str, str]]:
"""
Extract links from a webpage with improved validation and error handling
Args:
url (str): URL to scrape
filter (str): Optional filter for links
Returns:
list: List of links if successful
dict: Error information if failed
"""
try:
if not validate_url(url):
return {"error": "Invalid URL format"}
        logger.info(f"Scraping links from {url}")
        response = req.get(url, impersonate='chrome110', timeout=30, max_redirects=5)
soup = BeautifulSoup(response.text, 'html.parser')
links = []
base_url = url
try:
for a in soup.find_all('a', href=True):
href = a['href']
# Convert relative URLs to absolute
full_url = urljoin(base_url, href)
if validate_url(full_url) and (not filter or filter.lower() in full_url.lower()):
links.append(full_url)
return list(set(links)) # Remove duplicates
except Exception as e:
logger.error(f"Error processing links: {str(e)}")
return {"error": f"Failed to process links: {str(e)}"}
except Exception as e:
logger.error(f"Error extracting links: {str(e)}")
return {"error": f"Failed to extract links: {str(e)}"}
def scrape_text(data: str) -> Union[str, Dict[str, str]]:
"""
Extract clean text content from HTML
Args:
data (str): HTML content
Returns:
str: Extracted text if successful
dict: Error information if failed
"""
try:
if not data:
return {"error": "No HTML content provided"}
soup = BeautifulSoup(data, 'html.parser')
# Remove script and style elements
for element in soup(['script', 'style', 'head']):
element.decompose()
# Get text and clean it
text = soup.get_text(separator='\n')
# Remove excessive newlines and whitespace
text = '\n'.join(line.strip() for line in text.split('\n') if line.strip())
return text
except Exception as e:
logger.error(f"Error extracting text: {str(e)}")
return {"error": f"Failed to extract text: {str(e)}"}
def scrape_div(data: str, div: str) -> Union[List[Dict[str, str]], Dict[str, str]]:
"""
Extract content from specific div elements
Args:
data (str): HTML content
div (str): Class or ID of the div to scrape
Returns:
        list: List of dicts with 'text' and 'html' for each matching element if successful
dict: Error information if failed
"""
try:
if not data:
return {"error": "No HTML content provided"}
if not div:
return {"error": "No div selector provided"}
soup = BeautifulSoup(data, 'html.parser')
results = []
# Try class first
elements = soup.find_all(class_=div)
if not elements:
# Try ID if no class found
elements = soup.find_all(id=div)
if not elements:
return {"error": f"No elements found with class or ID: {div}"}
for element in elements:
# Get both text and HTML content
content = {
"text": element.get_text(strip=True),
"html": str(element)
}
results.append(content)
return results
except Exception as e:
logger.error(f"Error extracting div content: {str(e)}")
return {"error": f"Failed to extract div content: {str(e)}"}
def scrape_metadata(data: str) -> Dict[str, str]:
    """Extract metadata from meta tags (name/property -> content) in HTML content"""
    soup = BeautifulSoup(data, 'html.parser')
metadata = {}
for meta in soup.find_all('meta'):
name = meta.get('name') or meta.get('property')
content = meta.get('content')
if name and content:
metadata[name] = content
return metadata
def scrape_tables(data: str) -> List[List[List[str]]]:
    """Extract table data from HTML content as nested lists: tables -> rows -> cell text"""
    soup = BeautifulSoup(data, 'html.parser')
tables = []
for table in soup.find_all('table'):
rows = []
for row in table.find_all('tr'):
cells = [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]
rows.append(cells)
tables.append(rows)
return tables
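# Illustrative usage sketch: fetch a page once, then run the extractors over the
# returned HTML. The URL below is only a placeholder, and each result is checked
# with isinstance() because every scrape_* call returns an error dict on failure.
if __name__ == "__main__":
    demo_url = "https://example.com"  # placeholder URL for demonstration only
    html = scrape_html(demo_url)
    if isinstance(html, dict):
        logger.error(f"Fetch failed: {html['error']}")
    else:
        print(scrape_metadata(html))               # meta name/property -> content
        print(scrape_images(html, filter="png"))   # image URLs containing "png"
        print(scrape_tables(html))                 # tables as nested lists of cell text
        text = scrape_text(html)
        if isinstance(text, str):
            print(text[:200])                      # first 200 characters of page text
        print(scrape_links(demo_url))              # absolute links found on the page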