import requests
from bs4 import BeautifulSoup
from newspaper import (
    ArticleBinaryDataException,
    ArticleException,
    article,
)

from src.application.config import MAX_URL_SIZE

# Shared timeout (seconds) for every HTTP request so a stalled server
# cannot hang extraction indefinitely.
REQUEST_TIMEOUT = 10


class URLReader:
    """
    A class to extract content (title, text, images) from a given URL.

    Supports two extraction methods: newspaper4k and BeautifulSoup.
    After construction, the extracted data is available on the
    ``title``, ``text``, ``images`` and ``top_image`` attributes;
    ``is_extracted`` is ``True`` only if extraction actually succeeded.
    """

    def __init__(self, url: str, newspaper: bool = True):
        """
        Initializes the URLReader object and immediately runs extraction.

        Args:
            url: The URL to extract content from.
            newspaper: True to use newspaper4k, False to use BeautifulSoup.
        """
        self.url: str = url
        self.text: str | None = None          # Extracted text content
        self.title: str | None = None         # Extracted title
        self.images: list[str] | None = None  # List of image URLs
        self.top_image: str | None = None     # URL of the top image
        self.is_extracted: bool = False       # Set True on successful extraction

        # Skip extraction entirely for unreachable or oversized resources.
        url_size = self.get_size()
        if url_size is None or url_size > MAX_URL_SIZE:
            return

        self.newspaper = newspaper
        if self.newspaper:
            self.extract_content_newspaper()
        else:
            self.extract_content_bs()

    def extract_content_newspaper(self):
        """
        Extracts content from a URL using the newspaper4k library.

        Populates ``title``, ``text``, ``images`` and ``top_image``;
        sets ``is_extracted`` on success. Errors are logged and swallowed
        (the attributes keep their ``None`` defaults).
        """
        try:
            # Pre-flight fetch to fail fast on bad HTTP statuses before
            # handing the URL to newspaper4k.
            response = requests.get(self.url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()  # Raise HTTPError for bad responses

            news = article(url=self.url, fetch_images=True)
            self.title = news.title
            self.text = news.text
            self.images = list(set(news.images))  # Remove duplicates
            self.top_image = news.top_image
            self.is_extracted = True
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
        except (ArticleException, ArticleBinaryDataException) as e:
            print(f"\t\t↑↑↑ Error downloading article: {e}")

    def extract_content_bs(self):
        """
        Extracts content from a URL using BeautifulSoup.

        Collects image URLs, the page title and all paragraph text
        (after stripping images, captions, tables, scripts and styles);
        sets ``is_extracted`` on success. Errors are logged and swallowed.
        """
        try:
            response = requests.get(self.url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            response.encoding = response.apparent_encoding  # Detect encoding
            soup = BeautifulSoup(response.content, "html.parser")

            self.title = soup.title.string if soup.title else None

            # Only keep <img> tags that actually carry a src attribute
            # (img["src"] would raise KeyError on ones that don't).
            self.images = [
                img["src"] for img in soup.find_all("img") if img.get("src")
            ]
            # Guard against pages with no images (previously IndexError,
            # which aborted the whole extraction via the broad except).
            self.top_image = self.images[0] if self.images else None

            # Remove unwanted elements from the HTML before text collection.
            for element in soup(
                ["img", "figcaption", "table", "script", "style"],
            ):
                element.extract()

            paragraphs = soup.find_all("p")
            self.text = " ".join(p.get_text() for p in paragraphs)
            self.is_extracted = True
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
        except Exception as e:
            print(f"Error parsing HTML content from {self.url}: {e}")

    def get_size(self):
        """
        Retrieves the size of a URL's content using a HEAD request.

        Returns:
            The value of the Content-Length header as an int, or None if
            the request fails or the header is absent.
        """
        try:
            response = requests.head(
                self.url,
                allow_redirects=True,
                timeout=5,
            )
            response.raise_for_status()  # Raise HTTPError for bad responses

            content_length = response.headers.get("Content-Length")
            if content_length is not None:
                return int(content_length)
            print("\t\t↑↑↑ Content-Length header not found")
            return None
        except requests.exceptions.RequestException as e:
            print(f"\t\t↑↑↑ Error getting URL size: {e}")
            return None


if __name__ == "__main__":
    url = "https://www.bbc.com/sport/football/articles/c2d3rdy3673o"
    reader = URLReader(url)
    print(f"Title: {reader.title}")
    print(f"Text: {reader.text}")