Spaces:
Running
Running
File size: 1,605 Bytes
372531f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests
from ..utils import get_relevant_images, extract_title
class WebBaseLoaderScraper:
    """Scrape a single web page for its text, relevant images, and title.

    The page text is obtained through LangChain's ``WebBaseLoader``; the
    images and title are extracted from a separate fetch of the same URL
    made with ``self.session``.
    """

    def __init__(self, link, session=None):
        """Store the target URL and the HTTP session used for the second fetch.

        Args:
            link: URL of the page to scrape.
            session: Object exposing ``get(url)`` (e.g. ``requests.Session``).
                A new ``requests.Session`` is created when a falsy value is
                given.
        """
        self.link = link
        self.session = session or requests.Session()

    def scrape(self) -> tuple:
        """Fetch the page and return its text, image URLs, and title.

        Returns:
            A 3-tuple ``(content, image_urls, title)``:

            * ``content`` -- the ``page_content`` of every document returned
              by ``WebBaseLoader.load()``, concatenated in order.
            * ``image_urls`` -- the result of
              ``get_relevant_images(soup, self.link)``.
            * ``title`` -- the result of ``extract_title(soup)``.

            If any step raises (including a failed import of
            ``langchain_community``), the error is printed and
            ``("", [], "")`` is returned instead; no exception propagates
            to the caller.
        """
        try:
            # Imported lazily, inside the try, so a missing optional
            # dependency is reported like any other scrape failure.
            from langchain_community.document_loaders import WebBaseLoader

            # The loader is built from the URL only; it does not use
            # self.session.
            loader = WebBaseLoader(self.link)
            # NOTE(review): verify=False is forwarded to the loader's HTTP
            # requests, which presumably disables TLS certificate
            # verification -- confirm this is intended.
            loader.requests_kwargs = {"verify": False}
            docs = loader.load()
            content = "".join(doc.page_content for doc in docs)

            # Second fetch of the same URL: the raw HTML is needed for
            # image and title extraction.
            response = self.session.get(self.link)
            soup = BeautifulSoup(response.content, 'html.parser')
            image_urls = get_relevant_images(soup, self.link)

            # Extract the title using the utility function
            title = extract_title(soup)

            return content, image_urls, title
        except Exception as e:
            # Best-effort scraper: report the failure and hand back empty
            # results rather than raising.
            print("Error! : " + str(e))
            return "", [], ""
|