import requests
from bs4 import BeautifulSoup
from newspaper import (
    ArticleBinaryDataException,
    ArticleException,
    article,
)

# TODO: move this to a config file
MAX_URL_SIZE = 2000000  # ~2MB


class URLReader:
    def __init__(self, url: str, newspaper: bool = True):
        self.url = url
        self.text = None  # str
        self.title = None  # str
        self.images = None  # list of image URLs
        self.top_image = None  # image URL
        self.is_extracted = False

        # Skip extraction for unreachable or oversized pages
        url_size = self.get_size()
        if url_size is None or url_size > MAX_URL_SIZE:
            return

        self.is_extracted = True
        # True if using newspaper4k, False if using BeautifulSoup
        self.newspaper = newspaper
        if self.newspaper:
            self.extract_content_newspaper()
        else:
            self.extract_content_bs()

    def extract_content_newspaper(self):
        """
        Use newspaper4k to extract content from the URL.

        Populates title, text, images, and top_image on the instance.
        """
        # Reachability check; article() performs its own download below
        try:
            response = requests.get(self.url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None
        try:
            news = article(url=self.url, fetch_images=True)
        except (ArticleException, ArticleBinaryDataException) as e:
            print(f"\t\t└── Error downloading article: {e}")
            return None
        self.title = news.title
        self.text = news.text
        self.images = list(set(news.images))  # Remove duplicates
        self.top_image = news.top_image

    def extract_content_bs(self):
        """
        Use BeautifulSoup to extract and process content from the URL.
        """
        try:
            response = requests.get(self.url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None
        response.encoding = response.apparent_encoding
        try:
            soup = BeautifulSoup(response.content, "html.parser")
        except Exception as e:
            print(f"Error parsing HTML content from {self.url}: {e}")
            return None
        self.title = (
            soup.title.string.strip()
            if soup.title and soup.title.string
            else None
        )
        # Collect image URLs, skipping <img> tags without a src attribute
        image_urls = [img["src"] for img in soup.find_all("img") if img.get("src")]
        self.images = image_urls
        self.top_image = self.images[0] if self.images else None
        # Exclude text within specific elements
        for element in soup(["img", "figcaption", "table", "script", "style"]):
            element.extract()
        # text = soup.get_text(separator="\n")
        paragraphs = soup.find_all("p")
        self.text = " ".join(p.get_text() for p in paragraphs)

    def get_size(self):
        """
        Retrieves the size of the URL's content using a HEAD request.

        Returns:
            The size of the content in bytes,
            or None if the size cannot be determined
            (e.g., due to network errors or a missing Content-Length header).
        """
        try:
            response = requests.head(
                self.url,
                allow_redirects=True,
                timeout=5,
            )
            response.raise_for_status()  # Raise HTTPError for bad responses
            content_length = response.headers.get("Content-Length")
            if content_length is not None:
                return int(content_length)
            print("\t\t└── Content-Length header not found")
            return None
        except requests.exceptions.RequestException as e:
            print(f"\t\t└── Error getting URL size: {e}")
            return None


if __name__ == "__main__":
    url = "https://www.bbc.com/sport/football/articles/c2d3rdy3673o"
    reader = URLReader(url)
    print(f"Title: {reader.title}")
    print(f"Text: {reader.text}")