pmkhanh7890's picture
refactor code + fix bug of label after grouping url
00b1038
raw
history blame
4.23 kB
import string
import requests
from bs4 import BeautifulSoup
from newspaper import (
ArticleBinaryDataException,
ArticleException,
article,
)
from src.application.config import MAX_URL_SIZE
class URLReader:
    """
    A class to extract content (title, text, images) from a given URL.
    Supports two extraction methods: newspaper4k and BeautifulSoup.
    """

    def __init__(self, url: str, newspaper: bool = True):
        """
        Initializes the URLReader object and attempts extraction immediately.

        Args:
            url: The URL to extract content from.
            newspaper: True to use newspaper4k, False to use BeautifulSoup.
        """
        self.url: str = url
        self.text: str | None = None  # Extracted text content
        self.title: str | None = None  # Extracted title
        self.images: list[str] | None = None  # List of image URLs
        self.top_image: str | None = None  # URL of the top image
        # True once the URL passed the size check and extraction was
        # attempted (extraction itself may still fail; see extract_* methods).
        self.is_extracted: bool = False

        # Skip extraction entirely when the content size is unknown or
        # exceeds the configured limit.
        url_size = self.get_size()
        if url_size is None or url_size > MAX_URL_SIZE:
            return

        self.is_extracted = True
        self.newspaper = newspaper
        if self.newspaper is True:
            self.extract_content_newspaper()
        else:
            self.extract_content_bs()

    def extract_content_newspaper(self):
        """
        Extracts content from a URL using the newspaper4k library.

        Populates title, text, images and top_image on success; on failure
        logs the error and leaves the attributes unchanged (None).
        """
        try:
            # Pre-flight request surfaces HTTP errors early; article()
            # performs its own fetch. timeout prevents hanging forever
            # (consistent with get_size).
            response = requests.get(self.url, timeout=5)
            response.raise_for_status()  # Raise HTTPError for bad responses

            news = article(url=self.url, fetch_images=True)
            self.title = news.title
            self.text = news.text
            self.images = list(set(news.images))  # Remove duplicates
            self.top_image = news.top_image
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None
        except (ArticleException, ArticleBinaryDataException) as e:
            print(f"\t\t↑↑↑ Error downloading article: {e}")
            return None

    def extract_content_bs(self):
        """
        Extracts content from a URL using BeautifulSoup.

        Populates title, text, images and top_image on success; on failure
        logs the error and leaves whatever was set so far.
        """
        try:
            response = requests.get(self.url, timeout=5)
            response.raise_for_status()
            response.encoding = response.apparent_encoding  # Detect encoding

            soup = BeautifulSoup(response.content, "html.parser")
            self.title = soup.title.string if soup.title else None

            # Some <img> tags carry no "src" attribute; skip them instead
            # of raising KeyError.
            image_urls = [
                img["src"] for img in soup.find_all("img") if img.get("src")
            ]
            self.images = image_urls
            # Guard against pages with no images: the original indexed [0]
            # unconditionally, raising IndexError that was swallowed by the
            # generic except below and aborted text extraction.
            self.top_image = self.images[0] if self.images else None

            # Remove unwanted elements from the HTML
            for element in soup(
                ["img", "figcaption", "table", "script", "style"],
            ):
                element.extract()

            paragraphs = soup.find_all("p")
            self.text = " ".join([p.get_text() for p in paragraphs])
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None
        except Exception as e:
            print(f"Error parsing HTML content from {self.url}: {e}")
            return None

    def get_size(self):
        """
        Retrieves the size of a URL's content using a HEAD request.

        Returns:
            Content-Length in bytes as an int, or None when the request
            fails or the header is absent.
        """
        try:
            response = requests.head(
                self.url,
                allow_redirects=True,
                timeout=5,
            )
            response.raise_for_status()  # Raise HTTPError for bad responses

            content_length = response.headers.get("Content-Length")
            if content_length is not None:
                return int(content_length)
            print("\t\t↑↑↑ Content-Length header not found")
            return None
        except requests.exceptions.RequestException as e:
            print(f"\t\t↑↑↑ Error getting URL size: {e}")
            return None
if __name__ == "__main__":
    # Quick manual smoke test against a live article.
    sample_url = "https://www.bbc.com/sport/football/articles/c2d3rdy3673o"
    url_reader = URLReader(sample_url)
    print(f"Title: {url_reader.title}")
    print(f"Text: {url_reader.text}")