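"""Fetch a web page and extract its title, text, and images, using either
newspaper4k or BeautifulSoup."""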

import requests
from bs4 import BeautifulSoup
from newspaper import (
    ArticleBinaryDataException,
    ArticleException,
    article,
)

# TODO: move this to a config file
MAX_URL_SIZE = 2_000_000  # ~2 MB


class URLReader:
    def __init__(self, url: str, newspaper: bool = True):
        self.url = url
        self.text = None  # str
        self.title = None  # str
        self.images = None  # list of image URLs (str)
        self.top_image = None  # image URL (str)
        self.is_extracted = False

        # Skip extraction when the size is unknown or the page is too large.
        url_size = self.get_size()
        if url_size is None or url_size > MAX_URL_SIZE:
            return
        self.is_extracted = True

        # True to extract with newspaper4k, False to fall back to BeautifulSoup.
        self.newspaper = newspaper
        if self.newspaper:
            self.extract_content_newspaper()
        else:
            self.extract_content_bs()

    def extract_content_newspaper(self):
        """
        Use newspaper4k to extract content from self.url.

        Populates self.title, self.text, self.images, and self.top_image;
        returns None if the page cannot be fetched or parsed.
        """

        # Pre-check that the URL is reachable before parsing the article.
        try:
            response = requests.get(self.url, timeout=5)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None

        try:
            news = article(url=self.url, fetch_images=True)
        except (ArticleException, ArticleBinaryDataException) as e:
            print(f"\t\t↑↑↑ Error downloading article: {e}")
            return None

        self.title = news.title
        self.text = news.text
        self.images = list(set(news.images))  # Remove duplicates
        self.top_image = news.top_image

    def extract_content_bs(self):
        """
        Use BeautifulSoup to extract content from self.url.

        Populates the same attributes as extract_content_newspaper.
        """
        try:
            response = requests.get(self.url, timeout=5)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None

        response.encoding = response.apparent_encoding

        try:
            soup = BeautifulSoup(response.content, "html.parser")
        except Exception as e:
            print(f"Error parsing HTML content from {self.url}: {e}")
            return None

        # Guard against a missing or empty <title> tag.
        self.title = (
            soup.title.string.strip()
            if soup.title and soup.title.string
            else None
        )

        # Skip <img> tags without a src attribute to avoid a KeyError.
        self.images = [
            img["src"] for img in soup.find_all("img") if img.get("src")
        ]
        self.top_image = self.images[0] if self.images else None

        # Exclude text within elements that are not article prose.
        for element in soup(["img", "figcaption", "table", "script", "style"]):
            element.extract()
        paragraphs = soup.find_all("p")
        self.text = " ".join(p.get_text() for p in paragraphs)

    def get_size(self):
        """
        Retrieve the size of self.url's content using a HEAD request.

        Returns:
            The size of the content in bytes,
            or None if the size cannot be determined
            (e.g., due to network errors or a missing Content-Length header).
        """
        try:
            response = requests.head(
                self.url,
                allow_redirects=True,
                timeout=5,
            )
            response.raise_for_status()  # Raise HTTPError for bad responses

            content_length = response.headers.get("Content-Length")
            if content_length is not None:
                return int(content_length)
            else:
                print("\t\t↑↑↑ Content-Length header not found")
                return None

        except requests.exceptions.RequestException as e:
            print(f"\t\t↑↑↑ Error getting URL size: {e}")
        return None


if __name__ == "__main__":
    url = "https://www.bbc.com/sport/football/articles/c2d3rdy3673o"
    reader = URLReader(url)
    print(f"Title: {reader.title}")
    print(f"Text: {reader.text}")
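
    # A minimal sketch of the BeautifulSoup fallback path, assuming the same
    # article URL parses cleanly without newspaper4k (newspaper=False routes
    # extraction through extract_content_bs instead).
    bs_reader = URLReader(url, newspaper=False)
    print(f"Title (BS fallback): {bs_reader.title}")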