import string

from bs4 import BeautifulSoup
from newspaper import article, ArticleException, ArticleBinaryDataException
import requests


class URLReader():
    """Fetch a web page and extract its title, text and images.

    Extraction runs immediately on construction, using either newspaper4k
    (the default) or BeautifulSoup depending on the ``newspaper`` flag.

    Attributes:
        url: The URL given to the constructor.
        timeout: Seconds to wait for the HTTP request made with ``requests``.
        text: Extracted body text, or None if extraction did not complete.
        title: Page title, or None if unavailable.
        images: In newspaper mode, the article's ``images`` attribute; in
            BeautifulSoup mode, a list of the ``src`` values of the page's
            ``<img>`` tags. None if extraction did not complete.
        top_image: In newspaper mode, the article's ``top_image`` attribute;
            in BeautifulSoup mode, the first entry of ``images`` (None when
            the page has no usable image).
        newspaper: Truthy to use newspaper4k, falsy to use BeautifulSoup.
    """

    # Without a timeout, requests.get() may block forever on a stalled server.
    DEFAULT_TIMEOUT = 10.0

    def __init__(self, url: str, newspaper: bool = True,
                 timeout: float = DEFAULT_TIMEOUT):
        """Store the settings and run the selected extractor.

        Args:
            url: The URL of the web page to read.
            newspaper: Use newspaper4k when truthy, BeautifulSoup otherwise.
            timeout: Seconds to wait for the HTTP request made with
                ``requests``.

        Raises:
            requests.exceptions.RequestException: Only in BeautifulSoup mode,
                when the page cannot be fetched (see ``extract_content_bs``).
        """
        self.url = url
        self.timeout = timeout
        self.text = None
        self.title = None
        self.images = None
        self.top_image = None
        self.newspaper = newspaper

        if self.newspaper:
            self.extract_content_newspaper()
        else:
            self.extract_content_bs()

    def extract_content_newspaper(self):
        """Extract title, text and images from ``self.url`` with newspaper4k.

        The URL is first requested with ``requests`` so that unreachable
        pages and HTTP error statuses are reported before newspaper4k is
        invoked. On any failure a message is printed and the attributes are
        left unchanged.

        Returns:
            None. Results are stored on ``self.title``, ``self.text``,
            ``self.images`` and ``self.top_image``.
        """
        try:
            # Only the status matters here; newspaper4k fetches the page itself.
            requests.get(self.url, timeout=self.timeout).raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None

        try:
            news = article(url=self.url, fetch_images=True)
        except (ArticleException, ArticleBinaryDataException) as e:
            print(f"\t\t↑↑↑ Error downloading article: {e}")
            return None

        self.title = news.title
        self.text = news.text
        self.images = news.images
        self.top_image = news.top_image

    def extract_content_bs(self):
        """Extract title, text and images from ``self.url`` with BeautifulSoup.

        The text is the space-joined content of every ``<p>`` element that
        remains after ``img``, ``figcaption``, ``table``, ``script`` and
        ``style`` elements have been removed from the document.

        Returns:
            None. Results are stored on ``self.title``, ``self.text``,
            ``self.images`` and ``self.top_image``.

        Raises:
            requests.exceptions.RequestException: If the page cannot be
                fetched or the server answers with an error status.
        """
        response = requests.get(self.url, timeout=self.timeout)
        response.raise_for_status()

        try:
            # Raw bytes are passed on purpose: BeautifulSoup detects the
            # document encoding itself.
            soup = BeautifulSoup(response.content, "html.parser")
        except Exception:
            print(f"Error parsing HTML content from {self.url}")
            return None

        # <title> may be missing, empty, or contain nested tags; in the last
        # two cases .string is None.
        title_tag = soup.title
        title_string = title_tag.string if title_tag is not None else None
        self.title = title_string.strip() if title_string is not None else None

        # Skip <img> tags that have no (or an empty) src attribute.
        self.images = [
            img["src"] for img in soup.find_all("img") if img.get("src")
        ]
        self.top_image = self.images[0] if self.images else None

        # Exclude text within specific elements
        for element in soup(["img", "figcaption", "table", "script", "style"]):
            element.extract()

        self.text = " ".join(p.get_text() for p in soup.find_all("p"))