from bs4 import BeautifulSoup
from newspaper import article, ArticleException, ArticleBinaryDataException
import requests

# TODO: move this to a config file
MAX_URL_SIZE = 2_000_000  # ~2 MB


class URLReader:
    def __init__(self, url: str, newspaper: bool = True):
        self.url = url
        self.text = None  # str
        self.title = None  # str
        self.images = None  # list of image URLs
        self.top_image = None  # image URL
        self.is_extracted = False
        
        url_size = self.get_size()
        if url_size is None or url_size > MAX_URL_SIZE:
            return
        self.is_extracted = True
        
        self.newspaper = newspaper  # True: use newspaper4k; False: use BeautifulSoup
        if self.newspaper:
            self.extract_content_newspaper()
        else:
            self.extract_content_bs()
        
    def extract_content_newspaper(self):
        """
        Use newspaper4k to extract content from self.url.

        Populates title, text, images, and top_image on success;
        returns None on failure.
        """
        
        try:
            # Quick reachability check; newspaper4k fetches the page itself
            response = requests.get(self.url, timeout=10)
            response.raise_for_status()  # Raise exception for unsuccessful requests
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None
        
        try:
            news = article(url=self.url, fetch_images=True)
        except (ArticleException, ArticleBinaryDataException) as e:
            print(f"\t\t↑↑↑ Error downloading article: {e}")
            return None
        
        self.title = news.title
        self.text = news.text
        self.images = list(set(news.images))  # Remove duplicates
        self.top_image = news.top_image

    def extract_content_bs(self):
        """
        Use BeautifulSoup to extract content from self.url.
        """
        try:
            response = requests.get(self.url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None

        # apparent_encoding only affects response.text, so parse the decoded text
        response.encoding = response.apparent_encoding

        try:
            soup = BeautifulSoup(response.text, "html.parser")
        except Exception as e:
            print(f"Error parsing HTML content from {self.url}: {e}")
            return None
        
        self.title = soup.title.string.strip() if soup.title and soup.title.string else None

        # Collect image URLs, skipping <img> tags without a src attribute
        self.images = [img["src"] for img in soup.find_all("img") if img.get("src")]
        self.top_image = self.images[0] if self.images else None
        
        # Exclude text within specific elements
        for element in soup(["img", "figcaption", "table", "script", "style"]):
            element.extract()
        paragraphs = soup.find_all('p')
        self.text = ' '.join(p.get_text() for p in paragraphs)
        
    def get_size(self):
        """
        Retrieve the size of self.url's content using a HEAD request.

        Returns:
            The content size in bytes, or None if it cannot be determined
            (e.g., network errors or a missing Content-Length header).
        """
        try:
            response = requests.head(self.url, allow_redirects=True, timeout=5)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)

            content_length = response.headers.get('Content-Length')
            if content_length is not None:
                return int(content_length)
            print("\t\t↑↑↑ Content-Length header not found")
            return None

        except requests.exceptions.RequestException as e:
            print(f"\t\t↑↑↑ Error getting URL size: {e}")
        return None
    

if __name__ == '__main__':
    url = "https://www.bbc.com/sport/football/articles/c2d3rdy3673o"
    reader = URLReader(url)
    print(f"Title: {reader.title}")
    print(f"Text: {reader.text}")