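"""Lightweight web-scraping helpers built on curl_cffi and BeautifulSoup.

Most scrape_* helpers return their result on success and an
{"error": ...} dict on failure, so callers can branch on the return type.
"""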
from curl_cffi import requests as req
from bs4 import BeautifulSoup
import logging
from typing import Union, List, Dict
from urllib.parse import urljoin, urlparse

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ScrapingError(Exception):
    """Custom exception for scraping errors."""
    pass


def validate_url(url: str) -> bool:
    """Validate that the given URL has a scheme and network location."""
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except Exception:
        return False


def clean_url(url: str) -> str:
    """Normalize protocol-relative URLs (//example.com/...) to https."""
    if url.startswith('//'):
        return f'https:{url}'
    return url


def scrape_html(url: str) -> Union[str, Dict[str, str]]:
    """
    Fetch HTML content from a URL with error handling.

    Args:
        url (str): The URL to scrape

    Returns:
        str: HTML content if successful
        dict: Error information if failed
    """
    try:
        if not validate_url(url):
            return {"error": "Invalid URL format"}
        response = req.get(
            url,
            impersonate='chrome110',
            timeout=30,
            max_redirects=5
        )
        # Surface HTTP errors (4xx/5xx) through the except branch below
        response.raise_for_status()
        # Only return the body when the server actually sent HTML
        content_type = response.headers.get('content-type', '').lower()
        if 'text/html' not in content_type:
            return {"error": f"Unexpected content type: {content_type}"}
        return response.text
    except Exception as e:
        logger.error(f"Error while scraping {url}: {e}")
        return {"error": f"Unexpected error: {e}"}


def scrape_images(data: str, filter: str = "") -> Union[List[str], Dict[str, str]]:
    """
    Extract image URLs from HTML content, including CSS background images.

    Args:
        data (str): HTML content
        filter (str): Optional case-insensitive substring filter for URLs

    Returns:
        list: List of unique image URLs if successful
        dict: Error information if failed
    """
    try:
        if not data:
            return {"error": "No HTML content provided"}
        soup = BeautifulSoup(data, 'html.parser')
        images = []

        def keep(src: str) -> bool:
            return validate_url(src) and (not filter or filter.lower() in src.lower())

        # <img> tags, including lazy-loaded images that use data-src
        for img in soup.find_all('img'):
            src = img.get('src') or img.get('data-src')
            if src:
                src = clean_url(src)
                if keep(src):
                    images.append(src)

        # Background images declared in inline style attributes
        for elem in soup.find_all(style=True):
            style = elem['style']
            if 'background-image' in style:
                start = style.find('url(')
                if start != -1:
                    end = style.find(')', start)
                    if end != -1:
                        src = clean_url(style[start + 4:end].strip('"\''))
                        if keep(src):
                            images.append(src)

        return list(set(images))  # Remove duplicates (order is not preserved)
    except Exception as e:
        logger.error(f"Error extracting images: {e}")
        return {"error": f"Failed to extract images: {e}"}


def scrape_links(url: str, filter: str = "") -> Union[List[str], Dict[str, str]]:
    """
    Fetch a page and extract its links, resolving relative URLs.

    Args:
        url (str): URL to scrape
        filter (str): Optional case-insensitive substring filter for links

    Returns:
        list: List of unique absolute links if successful
        dict: Error information if failed
    """
    try:
        if not validate_url(url):
            return {"error": "Invalid URL format"}
        logger.info(f"Scraping links from {url}")
        response = req.get(url, impersonate='chrome110', timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = []
        for a in soup.find_all('a', href=True):
            # Convert relative URLs to absolute against the page URL
            full_url = urljoin(url, a['href'])
            if validate_url(full_url) and (not filter or filter.lower() in full_url.lower()):
                links.append(full_url)
        return list(set(links))  # Remove duplicates (order is not preserved)
    except Exception as e:
        logger.error(f"Error extracting links: {e}")
        return {"error": f"Failed to extract links: {e}"}


def scrape_text(data: str) -> Union[str, Dict[str, str]]:
    """
    Extract clean text content from HTML.

    Args:
        data (str): HTML content

    Returns:
        str: Extracted text if successful
        dict: Error information if failed
    """
    try:
        if not data:
            return {"error": "No HTML content provided"}
        soup = BeautifulSoup(data, 'html.parser')
        # Remove elements that carry no visible text
        for element in soup(['script', 'style', 'head']):
            element.decompose()
        # Collapse blank lines and trim surrounding whitespace
        text = soup.get_text(separator='\n')
        text = '\n'.join(line.strip() for line in text.split('\n') if line.strip())
        return text
    except Exception as e:
        logger.error(f"Error extracting text: {e}")
        return {"error": f"Failed to extract text: {e}"}


def scrape_div(data: str, div: str) -> Union[List[Dict[str, str]], Dict[str, str]]:
    """
    Extract content from elements matching a class or ID.

    Args:
        data (str): HTML content
        div (str): Class or ID of the element(s) to scrape

    Returns:
        list: List of {"text": ..., "html": ...} dicts if successful
        dict: Error information if failed
    """
    try:
        if not data:
            return {"error": "No HTML content provided"}
        if not div:
            return {"error": "No div selector provided"}
        soup = BeautifulSoup(data, 'html.parser')
        # Try matching by class first, then fall back to ID
        elements = soup.find_all(class_=div)
        if not elements:
            elements = soup.find_all(id=div)
        if not elements:
            return {"error": f"No elements found with class or ID: {div}"}
        results = []
        for element in elements:
            # Keep both the visible text and the raw HTML of each match
            results.append({
                "text": element.get_text(strip=True),
                "html": str(element)
            })
        return results
    except Exception as e:
        logger.error(f"Error extracting div content: {e}")
        return {"error": f"Failed to extract div content: {e}"}


def scrape_metadata(data: str) -> Dict[str, str]:
    """Extract <meta> name/property and content pairs from HTML."""
    soup = BeautifulSoup(data, 'html.parser')
    metadata = {}
    for meta in soup.find_all('meta'):
        name = meta.get('name') or meta.get('property')
        content = meta.get('content')
        if name and content:
            metadata[name] = content
    return metadata


def scrape_tables(data: str) -> List[List[List[str]]]:
    """Extract every <table> as a list of rows, each row a list of cell texts."""
    soup = BeautifulSoup(data, 'html.parser')
    tables = []
    for table in soup.find_all('table'):
        rows = []
        for row in table.find_all('tr'):
            cells = [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]
            rows.append(cells)
        tables.append(rows)
    return tables
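

# A minimal usage sketch: run this file directly to smoke-test the helpers.
# Assumptions not in the original module: network access is available and
# the demo URL below is illustrative only, swap in your own target.
if __name__ == "__main__":
    demo_url = "https://example.com"  # hypothetical target
    page = scrape_html(demo_url)
    if isinstance(page, dict):
        # The helpers signal failure with an {"error": ...} dict
        print(f"Fetch failed: {page['error']}")
    else:
        print("Metadata:", scrape_metadata(page))
        print("Images:", scrape_images(page))
        print("Links:", scrape_links(demo_url))
        text = scrape_text(page)
        if isinstance(text, str):
            print("Text preview:", text[:200])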