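"""Web scraping helpers built on curl_cffi and BeautifulSoup.

Most helpers return the scraped result on success or an ``{"error": ...}``
dict on failure, so callers can branch on the return type instead of
catching exceptions.
"""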
from curl_cffi import requests as req
from bs4 import BeautifulSoup
import logging
from typing import Union, List, Dict
from urllib.parse import urljoin, urlparse

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ScrapingError(Exception):
    """Custom exception for scraping errors"""
    pass


def validate_url(url: str) -> bool:
    """Validate if the given URL is properly formatted"""
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except Exception:
        return False


def clean_url(url: str) -> str:
    """Clean and normalize URL"""
    if url.startswith('//'):
        return f'https:{url}'
    return url


def scrape_html(url: str) -> Union[str, Dict[str, str]]:
    """
    Fetch HTML content from a URL with improved error handling

    Args:
        url (str): The URL to scrape

    Returns:
        str: HTML content if successful
        dict: Error information if failed
    """
    try:
        if not validate_url(url):
            return {"error": "Invalid URL format"}

        response = req.get(
            url,
            impersonate='chrome110',
            timeout=30,
            max_redirects=5
        )

        # Treat HTTP error statuses as failures instead of returning the error page body
        if response.status_code >= 400:
            return {"error": f"HTTP error {response.status_code}"}

        # Check if response is HTML
        content_type = response.headers.get('content-type', '').lower()
        if 'text/html' not in content_type:
            return {"error": f"Unexpected content type: {content_type}"}

        return response.text
    except Exception as e:
        logger.error(f"Unexpected error while scraping {url}: {str(e)}")
        return {"error": f"Unexpected error: {str(e)}"}


def scrape_images(data: str, filter: str = "") -> Union[List[str], Dict[str, str]]:
    """
    Extract image URLs from HTML content with improved filtering and validation

    Args:
        data (str): HTML content
        filter (str): Optional filter string for URLs

    Returns:
        list: List of image URLs if successful
        dict: Error information if failed
    """
    try:
        if not data:
            return {"error": "No HTML content provided"}

        soup = BeautifulSoup(data, 'html.parser')
        images = []

        # Look for both img tags and background images in style attributes
        for img in soup.find_all('img'):
            src = img.get('src') or img.get('data-src')
            if src:
                src = clean_url(src)
                if validate_url(src) and (not filter or filter.lower() in src.lower()):
                    images.append(src)

        # Look for background images in style attributes
        for elem in soup.find_all(style=True):
            style = elem['style']
            if 'background-image' in style:
                url_start = style.find('url(') + 4
                url_end = style.find(')', url_start)
                if url_start > 4 and url_end != -1:
                    src = style[url_start:url_end].strip('"\'')
                    src = clean_url(src)
                    if validate_url(src) and (not filter or filter.lower() in src.lower()):
                        images.append(src)

        return list(set(images))  # Remove duplicates
    except Exception as e:
        logger.error(f"Error extracting images: {str(e)}")
        return {"error": f"Failed to extract images: {str(e)}"}


def scrape_links(url: str, filter: str = "") -> Union[List[str], Dict[str, str]]:
    """
    Extract links from a webpage with improved validation and error handling

    Args:
        url (str): URL to scrape
        filter (str): Optional filter for links

    Returns:
        list: List of links if successful
        dict: Error information if failed
    """
    try:
        if not validate_url(url):
            return {"error": "Invalid URL format"}

        response = req.get(url, impersonate='chrome110', timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = []
        base_url = url

        for a in soup.find_all('a', href=True):
            href = a['href']
            # Convert relative URLs to absolute
            full_url = urljoin(base_url, href)
            if validate_url(full_url) and (not filter or filter.lower() in full_url.lower()):
                links.append(full_url)

        return list(set(links))  # Remove duplicates
    except Exception as e:
        logger.error(f"Error extracting links: {str(e)}")
        return {"error": f"Failed to extract links: {str(e)}"}


def scrape_text(data: str) -> Union[str, Dict[str, str]]:
    """
    Extract clean text content from HTML

    Args:
        data (str): HTML content

    Returns:
        str: Extracted text if successful
        dict: Error information if failed
    """
    try:
        if not data:
            return {"error": "No HTML content provided"}

        soup = BeautifulSoup(data, 'html.parser')

        # Remove script and style elements
        for element in soup(['script', 'style', 'head']):
            element.decompose()

        # Get text and clean it
        text = soup.get_text(separator='\n')
        # Remove excessive newlines and whitespace
        text = '\n'.join(line.strip() for line in text.split('\n') if line.strip())

        return text
    except Exception as e:
        logger.error(f"Error extracting text: {str(e)}")
        return {"error": f"Failed to extract text: {str(e)}"}


def scrape_div(data: str, div: str) -> Union[List[Dict[str, str]], Dict[str, str]]:
    """
    Extract content from specific div elements

    Args:
        data (str): HTML content
        div (str): Class or ID of the div to scrape

    Returns:
        list: List of dicts with the "text" and "html" of each match if successful
        dict: Error information if failed
    """
    try:
        if not data:
            return {"error": "No HTML content provided"}
        if not div:
            return {"error": "No div selector provided"}

        soup = BeautifulSoup(data, 'html.parser')
        results = []

        # Try class first
        elements = soup.find_all(class_=div)
        if not elements:
            # Try ID if no class found
            elements = soup.find_all(id=div)

        if not elements:
            return {"error": f"No elements found with class or ID: {div}"}

        for element in elements:
            # Get both text and HTML content
            content = {
                "text": element.get_text(strip=True),
                "html": str(element)
            }
            results.append(content)

        return results
    except Exception as e:
        logger.error(f"Error extracting div content: {str(e)}")
        return {"error": f"Failed to extract div content: {str(e)}"}


# Function to scrape metadata
def scrape_metadata(data: str) -> Dict[str, str]:
    """Extract <meta> name/property and content pairs from HTML content"""
    soup = BeautifulSoup(data, 'html.parser')
    metadata = {}
    for meta in soup.find_all('meta'):
        name = meta.get('name') or meta.get('property')
        content = meta.get('content')
        if name and content:
            metadata[name] = content
    return metadata


# Function to scrape table data
def scrape_tables(data: str) -> List[List[List[str]]]:
    """Extract each <table> as a list of rows, where each row is a list of cell texts"""
    soup = BeautifulSoup(data, 'html.parser')
    tables = []
    for table in soup.find_all('table'):
        rows = []
        for row in table.find_all('tr'):
            cells = [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]
            rows.append(cells)
        tables.append(rows)
    return tables
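

# A minimal usage sketch, not part of the original helpers: the URL below is a
# placeholder assumption. It fetches a page once, reuses the returned HTML for
# the extraction helpers, and branches on the {"error": ...} dict convention
# used throughout this module.
if __name__ == "__main__":
    page_url = "https://example.com"  # placeholder URL (assumption)

    html = scrape_html(page_url)
    if isinstance(html, dict):
        # scrape_html signals failure with an {"error": ...} dict
        logger.error(html["error"])
    else:
        text = scrape_text(html)
        if isinstance(text, str):
            print("Text preview:", text[:200])
        print("Metadata:", scrape_metadata(html))
        print("Images:", scrape_images(html))
        print("Links:", scrape_links(page_url))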