Spaces:
Sleeping
Sleeping
from curl_cffi import requests as req | |
from bs4 import BeautifulSoup | |
import html2text | |
def scrape_to_markdown(url: str, *, impersonate: str = "chrome110") -> str:
    """Fetch a web page and return its content as Markdown.

    Args:
        url: The URL of the webpage to scrape.
        impersonate: Browser fingerprint name forwarded to ``req.get``.
            Defaults to ``"chrome110"``, the value that used to be
            hard-coded, so existing callers behave as before.

    Returns:
        The page's HTML, with non-content tags removed, converted to
        Markdown by ``html2text``.
    """
    # NOTE: the HTTP status is not inspected here, so an error page returned
    # by the server is converted to Markdown like any other response body.
    response = req.get(url, impersonate=impersonate)
    soup = BeautifulSoup(response.text, "html.parser")

    # Remove elements whose contents are code or vector graphics rather than
    # readable text, so they do not leak into the Markdown output.
    unwanted_tags = ["script", "style", "noscript", "svg", "css"]
    for tag in soup.find_all(unwanted_tags):
        tag.decompose()

    # Serialize the pruned tree back to HTML and hand it to html2text.
    return html2text.html2text(str(soup))