File size: 763 Bytes
f85ff86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from curl_cffi import requests as req
from bs4 import BeautifulSoup
import html2text

def scrape_to_markdown(url: str, *, impersonate: str = 'chrome110') -> str:
    """
    Fetch a webpage and return its content converted to Markdown.

    Args:
        url (str): The URL of the webpage to scrape.
        impersonate (str): Value forwarded to the HTTP client's
            ``impersonate`` option. Keyword-only; defaults to
            ``'chrome110'``.

    Returns:
        str: The page's HTML, with script/style-like tags removed,
            converted to Markdown.

    Raises:
        Exception: Whatever ``response.raise_for_status()`` raises when the
            server answers with an HTTP error status, in addition to any
            error raised by the request itself.
    """
    # Fetch HTML content
    response = req.get(url, impersonate=impersonate)

    # Fail loudly on an HTTP error status instead of converting an error
    # page to Markdown and returning it as if it were the real content.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # Drop tags whose contents are not readable page text.
    # NOTE(review): 'css' is not a standard HTML element; it is kept so any
    # page using it as a custom tag is still cleaned the same way.
    for tag in soup.find_all(['script', 'style', 'noscript', 'svg', 'css']):
        tag.decompose()

    # Serialize the cleaned tree back to HTML and convert it to Markdown.
    return html2text.html2text(str(soup))