# Spaces: Maouu/chipling-api (Sleeping) | File size: 763 Bytes | f85ff86

from curl_cffi import requests as req
from bs4 import BeautifulSoup
import html2text

def scrape_to_markdown(url, *, impersonate='chrome110', timeout=None):
    """
    Scrapes a webpage and converts its content to markdown format.

    The page is fetched with curl_cffi using a browser impersonation
    profile, non-content tags are stripped from the parsed document, and
    the remaining HTML is converted with html2text.

    Args:
        url (str): The URL of the webpage to scrape
        impersonate (str): Browser profile handed to curl_cffi. Defaults
            to 'chrome110', the value that used to be hard-coded here.
        timeout (float | None): Request timeout in seconds. When None
            (the default) no timeout argument is passed, so the
            library's own default applies, exactly as before.

    Returns:
        str: The webpage content converted to markdown

    Note:
        The HTTP status code is not checked: the body of an error
        response is converted like any other page. Network errors raised
        by curl_cffi propagate to the caller unchanged.
    """
    # Only forward options the caller actually set, so a plain
    # scrape_to_markdown(url) call issues the same request as it always did.
    request_options = {'impersonate': impersonate}
    if timeout is not None:
        request_options['timeout'] = timeout

    # Fetch HTML content
    response = req.get(url, **request_options)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Clean up unwanted tags. Calling the soup object is shorthand for
    # find_all(); 'css' is not a standard HTML element but is kept so that
    # any such custom tag is still dropped as it was previously.
    for tag in soup(['script', 'style', 'noscript', 'svg', 'css']):
        tag.decompose()

    # Serialize the cleaned tree and convert it to Markdown
    return html2text.html2text(str(soup))