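"""Fetch a web page and extract its title, text, and images, using either
newspaper4k or BeautifulSoup, subject to a maximum content size."""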
from bs4 import BeautifulSoup
from newspaper import article, ArticleException, ArticleBinaryDataException
import requests

# TODO: move this to a config file
MAX_URL_SIZE = 2_000_000  # ~2 MB


class URLReader:
    def __init__(self, url: str, newspaper: bool = True):
        self.url = url
        self.text = None  # str
        self.title = None  # str
        self.images = None  # list of image URLs (str)
        self.top_image = None  # image URL (str)
        self.is_extracted = False

        # Skip pages whose size is unknown or exceeds the limit.
        url_size = self.get_size()
        if url_size is None or url_size > MAX_URL_SIZE:
            return

        self.newspaper = newspaper  # True: use newspaper4k; False: use BeautifulSoup
        if self.newspaper:
            self.extract_content_newspaper()
        else:
            self.extract_content_bs()
        self.is_extracted = self.text is not None
        
    def extract_content_newspaper(self):
        """
        Extract content from self.url using newspaper4k.

        Populates title, text, images, and top_image on success; leaves
        them as None on failure.
        """
        # newspaper4k downloads the page itself, so no separate
        # requests.get() is needed; network errors surface here too.
        try:
            news = article(url=self.url, fetch_images=True)
        except (ArticleException, ArticleBinaryDataException,
                requests.exceptions.RequestException) as e:
            print(f"Error downloading article: {e}")
            return
        
        self.title = news.title
        self.text = news.text
        self.images = list(set(news.images))  # Remove duplicates
        self.top_image = news.top_image

    def extract_content_bs(self):
        """
        Extract content from self.url using BeautifulSoup.

        Populates title, text, images, and top_image on success.
        """
        try:
            response = requests.get(self.url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return

        # Use the detected encoding so response.text decodes correctly.
        response.encoding = response.apparent_encoding

        try:
            soup = BeautifulSoup(response.text, "html.parser")
        except Exception as e:
            print(f"Error parsing HTML content from {self.url}: {e}")
            return

        self.title = soup.title.string.strip() if soup.title and soup.title.string else None

        # Collect image URLs, skipping <img> tags without a src attribute.
        self.images = [img['src'] for img in soup.find_all('img') if img.get('src')]
        self.top_image = self.images[0] if self.images else None

        # Exclude text within specific elements.
        for element in soup(["img", "figcaption", "table", "script", "style"]):
            element.extract()
        paragraphs = soup.find_all('p')
        self.text = ' '.join(p.get_text() for p in paragraphs)
        
    def get_size(self):
        """
        Retrieves the size of a URL's content using a HEAD request.

        Args:
            url: The URL to check.

        Returns:
            The size of the content in bytes, or None if the size cannot be determined
            (e.g., due to network errors or missing Content-Length header).
        """
        try:
            response = requests.head(self.url, allow_redirects=True, timeout=5)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)

            content_length = response.headers.get('Content-Length')
            if content_length is not None:
                return int(content_length)
            else:
                print("Content-Length header not found")
                return None

        except requests.exceptions.RequestException as e:
            print(f"\t\t↑↑↑ Error getting URL size: {e}")
        return None
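

# Minimal usage sketch (assumption: the URL below is a placeholder for
# illustration, not part of the original module).
if __name__ == "__main__":
    reader = URLReader("https://example.com/some-article", newspaper=True)
    if reader.is_extracted:
        print(f"Title: {reader.title}")
        print(f"Top image: {reader.top_image}")
        print(f"Text preview: {reader.text[:200]}")
    else:
        print("Extraction skipped or failed (size unknown/too large, or download error).")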