import requests
from bs4 import BeautifulSoup
from newspaper import (
    ArticleBinaryDataException,
    ArticleException,
    article,
)

from src.application.config import MAX_URL_SIZE

# Shared timeout (seconds) for every HTTP request so a stalled server
# cannot hang extraction indefinitely.
REQUEST_TIMEOUT = 10


class URLReader:
    """
    A class to extract content (title, text, images) from a given URL.

    Supports two extraction methods: newspaper4k and BeautifulSoup.
    After construction, the extracted data is available on the
    ``title``, ``text``, ``images`` and ``top_image`` attributes;
    ``is_extracted`` is ``True`` only if extraction actually succeeded.
    """

    def __init__(self, url: str, newspaper: bool = True):
        """
        Initializes the URLReader object and immediately runs extraction.

        Args:
            url: The URL to extract content from.
            newspaper: True to use newspaper4k, False to use BeautifulSoup.
        """
        self.url: str = url
        self.text: str | None = None          # Extracted text content
        self.title: str | None = None         # Extracted title
        self.images: list[str] | None = None  # List of image URLs
        self.top_image: str | None = None     # URL of the top image
        self.is_extracted: bool = False       # Set True on successful extraction

        # Skip extraction entirely for unreachable or oversized resources.
        url_size = self.get_size()
        if url_size is None or url_size > MAX_URL_SIZE:
            return

        self.newspaper = newspaper
        if self.newspaper:
            self.extract_content_newspaper()
        else:
            self.extract_content_bs()

    def extract_content_newspaper(self):
        """
        Extracts content from a URL using the newspaper4k library.

        Populates ``title``, ``text``, ``images`` and ``top_image``;
        sets ``is_extracted`` on success. Errors are logged and swallowed
        (the attributes keep their ``None`` defaults).
        """
        try:
            # Pre-flight fetch to fail fast on bad HTTP statuses before
            # handing the URL to newspaper4k.
            response = requests.get(self.url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()  # Raise HTTPError for bad responses

            news = article(url=self.url, fetch_images=True)
            self.title = news.title
            self.text = news.text
            self.images = list(set(news.images))  # Remove duplicates
            self.top_image = news.top_image
            self.is_extracted = True
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
        except (ArticleException, ArticleBinaryDataException) as e:
            print(f"\t\t↑↑↑ Error downloading article: {e}")

    def extract_content_bs(self):
        """
        Extracts content from a URL using BeautifulSoup.

        Collects image URLs, the page title and all paragraph text
        (after stripping images, captions, tables, scripts and styles);
        sets ``is_extracted`` on success. Errors are logged and swallowed.
        """
        try:
            response = requests.get(self.url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            response.encoding = response.apparent_encoding  # Detect encoding
            soup = BeautifulSoup(response.content, "html.parser")

            self.title = soup.title.string if soup.title else None

            # Only keep <img> tags that actually carry a src attribute
            # (img["src"] would raise KeyError on ones that don't).
            self.images = [
                img["src"] for img in soup.find_all("img") if img.get("src")
            ]
            # Guard against pages with no images (previously IndexError,
            # which aborted the whole extraction via the broad except).
            self.top_image = self.images[0] if self.images else None

            # Remove unwanted elements from the HTML before text collection.
            for element in soup(
                ["img", "figcaption", "table", "script", "style"],
            ):
                element.extract()

            paragraphs = soup.find_all("p")
            self.text = " ".join(p.get_text() for p in paragraphs)
            self.is_extracted = True
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
        except Exception as e:
            print(f"Error parsing HTML content from {self.url}: {e}")

    def get_size(self):
        """
        Retrieves the size of a URL's content using a HEAD request.

        Returns:
            The value of the Content-Length header as an int, or None if
            the request fails or the header is absent.
        """
        try:
            response = requests.head(
                self.url,
                allow_redirects=True,
                timeout=5,
            )
            response.raise_for_status()  # Raise HTTPError for bad responses

            content_length = response.headers.get("Content-Length")
            if content_length is not None:
                return int(content_length)
            print("\t\t↑↑↑ Content-Length header not found")
            return None
        except requests.exceptions.RequestException as e:
            print(f"\t\t↑↑↑ Error getting URL size: {e}")
            return None


if __name__ == "__main__":
    url = "https://www.bbc.com/sport/football/articles/c2d3rdy3673o"
    reader = URLReader(url)
    print(f"Title: {reader.title}")
    print(f"Text: {reader.text}")