Spaces:
Sleeping
Sleeping
File size: 763 Bytes
f85ff86 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
from curl_cffi import requests as req
from bs4 import BeautifulSoup
import html2text
def scrape_to_markdown(url, *, impersonate='chrome110', timeout=30):
    """
    Scrapes a webpage and converts its content to markdown format.

    The page is fetched with curl_cffi, parsed with BeautifulSoup's
    built-in 'html.parser', stripped of non-content tags, and the
    remaining HTML is handed to html2text.

    Note: the HTTP status code is not checked, so an error page (e.g. a
    404 body) is converted to markdown just like a successful response.

    Args:
        url (str): The URL of the webpage to scrape
        impersonate (str): Value forwarded to curl_cffi's ``impersonate``
            argument. Keyword-only; defaults to 'chrome110', the value
            this function has always used.
        timeout (float): Value forwarded to curl_cffi's ``timeout``
            argument (presumably seconds - confirm against the curl_cffi
            docs). Keyword-only; the default of 30 is intended to match
            the library's own default so existing callers see no change.
    Returns:
        str: The webpage content converted to markdown
    """
    # Fetch HTML content
    response = req.get(url, impersonate=impersonate, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Clean up unwanted tags: calling the soup with a list of names
    # collects every matching element; decompose() removes each one
    # (and its children) from the tree.
    for tag in soup(['script', 'style', 'noscript', 'svg', 'css']):
        tag.decompose()

    # Serialize the cleaned tree back to HTML and convert to Markdown
    clean_html = str(soup)
    return html2text.html2text(clean_html)
|