from curl_cffi import requests as req
from bs4 import BeautifulSoup
import logging
from typing import Union, List, Dict, Optional
from urllib.parse import urljoin, urlparse
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class ScrapingError(Exception):
"""Custom exception for scraping errors"""
pass
def validate_url(url: str) -> bool:
"""Validate if the given URL is properly formatted"""
try:
result = urlparse(url)
return all([result.scheme, result.netloc])
except Exception:
return False
def clean_url(url: str) -> str:
"""Clean and normalize URL"""
if url.startswith('//'):
return f'https:{url}'
return url
def scrape_html(url: str) -> Union[str, Dict[str, str]]:
"""
Fetch HTML content from a URL with improved error handling
Args:
url (str): The URL to scrape
Returns:
str: HTML content if successful
dict: Error information if failed
"""
try:
if not validate_url(url):
return {"error": "Invalid URL format"}
response = req.get(
url,
impersonate='chrome110',
timeout=30,
max_redirects=5
)
# Check if response is HTML
content_type = response.headers.get('content-type', '').lower()
if 'text/html' not in content_type:
return {"error": f"Unexpected content type: {content_type}"}
return response.text
except Exception as e:
logger.error(f"Unexpected error while scraping {url}: {str(e)}")
return {"error": f"Unexpected error: {str(e)}"}
def scrape_images(data: str, filter: str = "") -> Union[List[str], Dict[str, str]]:
"""
Extract image URLs from HTML content with improved filtering and validation
Args:
data (str): HTML content
filter (str): Optional filter string for URLs
Returns:
list: List of image URLs if successful
dict: Error information if failed
"""
try:
if not data:
return {"error": "No HTML content provided"}
soup = BeautifulSoup(data, 'html.parser')
images = []
# Look for both img tags and background images in style attributes
for img in soup.find_all('img'):
src = img.get('src') or img.get('data-src')
if src:
src = clean_url(src)
if validate_url(src) and (not filter or filter.lower() in src.lower()):
images.append(src)
# Look for background images in style attributes
for elem in soup.find_all(style=True):
style = elem['style']
if 'background-image' in style:
                url_start = style.find('url(')
                if url_start != -1:
                    url_end = style.find(')', url_start + 4)
                    if url_end != -1:
                        src = style[url_start + 4:url_end].strip('"\'')
                        src = clean_url(src)
                        if validate_url(src) and (not filter or filter.lower() in src.lower()):
                            images.append(src)
return list(set(images)) # Remove duplicates
except Exception as e:
logger.error(f"Error extracting images: {str(e)}")
return {"error": f"Failed to extract images: {str(e)}"}
def scrape_links(url: str, filter: str = "") -> Union[List[str], Dict[str, str]]:
"""
Extract links from a webpage with improved validation and error handling
Args:
url (str): URL to scrape
filter (str): Optional filter for links
Returns:
list: List of links if successful
dict: Error information if failed
"""
try:
if not validate_url(url):
return {"error": "Invalid URL format"}
        logger.info(f"Scraping links from {url}")
        response = req.get(url, impersonate='chrome110', timeout=30, max_redirects=5)
soup = BeautifulSoup(response.text, 'html.parser')
links = []
base_url = url
try:
for a in soup.find_all('a', href=True):
href = a['href']
# Convert relative URLs to absolute
full_url = urljoin(base_url, href)
if validate_url(full_url) and (not filter or filter.lower() in full_url.lower()):
links.append(full_url)
return list(set(links)) # Remove duplicates
except Exception as e:
logger.error(f"Error processing links: {str(e)}")
return {"error": f"Failed to process links: {str(e)}"}
except Exception as e:
logger.error(f"Error extracting links: {str(e)}")
return {"error": f"Failed to extract links: {str(e)}"}
def scrape_text(data: str) -> Union[str, Dict[str, str]]:
"""
Extract clean text content from HTML
Args:
data (str): HTML content
Returns:
str: Extracted text if successful
dict: Error information if failed
"""
try:
if not data:
return {"error": "No HTML content provided"}
soup = BeautifulSoup(data, 'html.parser')
# Remove script and style elements
for element in soup(['script', 'style', 'head']):
element.decompose()
# Get text and clean it
text = soup.get_text(separator='\n')
# Remove excessive newlines and whitespace
text = '\n'.join(line.strip() for line in text.split('\n') if line.strip())
return text
except Exception as e:
logger.error(f"Error extracting text: {str(e)}")
return {"error": f"Failed to extract text: {str(e)}"}
def scrape_div(data: str, div: str) -> Union[List[Dict[str, str]], Dict[str, str]]:
"""
Extract content from specific div elements
Args:
data (str): HTML content
div (str): Class or ID of the div to scrape
Returns:
        list: List of dicts with 'text' and 'html' for each matching element if successful
dict: Error information if failed
"""
try:
if not data:
return {"error": "No HTML content provided"}
if not div:
return {"error": "No div selector provided"}
soup = BeautifulSoup(data, 'html.parser')
results = []
# Try class first
elements = soup.find_all(class_=div)
if not elements:
# Try ID if no class found
elements = soup.find_all(id=div)
if not elements:
return {"error": f"No elements found with class or ID: {div}"}
for element in elements:
# Get both text and HTML content
content = {
"text": element.get_text(strip=True),
"html": str(element)
}
results.append(content)
return results
except Exception as e:
logger.error(f"Error extracting div content: {str(e)}")
return {"error": f"Failed to extract div content: {str(e)}"}
def scrape_metadata(data: str) -> Dict[str, str]:
    """Extract metadata from meta tags (name/property -> content) in HTML content"""
    soup = BeautifulSoup(data, 'html.parser')
metadata = {}
for meta in soup.find_all('meta'):
name = meta.get('name') or meta.get('property')
content = meta.get('content')
if name and content:
metadata[name] = content
return metadata
def scrape_tables(data: str) -> List[List[List[str]]]:
    """Extract table data from HTML content as nested lists: tables -> rows -> cell text"""
    soup = BeautifulSoup(data, 'html.parser')
tables = []
for table in soup.find_all('table'):
rows = []
for row in table.find_all('tr'):
cells = [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]
rows.append(cells)
tables.append(rows)
return tables
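# Illustrative usage sketch: fetch a page once, then run the extractors over the
# returned HTML. The URL below is only a placeholder, and each result is checked
# with isinstance() because every scrape_* call returns an error dict on failure.
if __name__ == "__main__":
    demo_url = "https://example.com"  # placeholder URL for demonstration only
    html = scrape_html(demo_url)
    if isinstance(html, dict):
        logger.error(f"Fetch failed: {html['error']}")
    else:
        print(scrape_metadata(html))               # meta name/property -> content
        print(scrape_images(html, filter="png"))   # image URLs containing "png"
        print(scrape_tables(html))                 # tables as nested lists of cell text
        text = scrape_text(html)
        if isinstance(text, str):
            print(text[:200])                      # first 200 characters of page text
        print(scrape_links(demo_url))              # absolute links found on the page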