import requests
from bs4 import BeautifulSoup
from newspaper import (
    ArticleBinaryDataException,
    ArticleException,
    article,
)

# TODO: move this to a config file
MAX_URL_SIZE = 2000000  # ~2MB


class URLReader:
    def __init__(self, url: str, newspaper: bool = True):
        self.url = url
        self.text = None  # str
        self.title = None  # str
        self.images = None  # list of image URLs
        self.top_image = None  # top image URL
        self.is_extracted = False
        url_size = self.get_size()
        if url_size is None or url_size > MAX_URL_SIZE:
            return
        self.is_extracted = True
        self.newspaper = (
            newspaper  # True if using newspaper4k, False if using BeautifulSoup
        )
        if self.newspaper:
            self.extract_content_newspaper()
        else:
            self.extract_content_bs()
    def extract_content_newspaper(self):
        """
        Use newspaper4k to extract content from self.url.

        Populates self.title, self.text, self.images, and self.top_image.
        """
        # Reachability pre-check; newspaper4k fetches the URL again itself.
        try:
            response = requests.get(self.url)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None
        try:
            news = article(url=self.url, fetch_images=True)
        except (ArticleException, ArticleBinaryDataException) as e:
            print(f"\t\t└── Error downloading article: {e}")
            return None
        self.title = news.title
        self.text = news.text
        self.images = list(set(news.images))  # Remove duplicates
        self.top_image = news.top_image
    def extract_content_bs(self):
        """
        Use BeautifulSoup to extract content from self.url.

        Populates self.title, self.text, self.images, and self.top_image.
        """
        try:
            response = requests.get(self.url)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None
        response.encoding = response.apparent_encoding
        try:
            soup = BeautifulSoup(response.content, "html.parser")
        except Exception as e:
            print(f"Error parsing HTML content from {self.url}: {e}")
            return None
        self.title = (
            soup.title.string.strip() if soup.title and soup.title.string else None
        )
        # Skip <img> tags without a src attribute to avoid KeyError
        image_urls = [img["src"] for img in soup.find_all("img") if img.get("src")]
        self.images = image_urls
        self.top_image = self.images[0] if self.images else None
        # Exclude text within specific elements
        for element in soup(["img", "figcaption", "table", "script", "style"]):
            element.extract()
        paragraphs = soup.find_all("p")
        self.text = " ".join(p.get_text() for p in paragraphs)
    def get_size(self):
        """
        Retrieve the size of self.url's content using a HEAD request.

        Returns:
            The size of the content in bytes,
            or None if the size cannot be determined
            (e.g., due to network errors or a missing Content-Length header).
        """
        try:
            response = requests.head(
                self.url,
                allow_redirects=True,
                timeout=5,
            )
            response.raise_for_status()  # Raise HTTPError for bad responses
            content_length = response.headers.get("Content-Length")
            if content_length is not None:
                return int(content_length)
            else:
                print("\t\t└── Content-Length header not found")
                return None
        except requests.exceptions.RequestException as e:
            print(f"\t\t└── Error getting URL size: {e}")
            return None


if __name__ == "__main__":
    url = "https://www.bbc.com/sport/football/articles/c2d3rdy3673o"
    reader = URLReader(url)
    print(f"Title: {reader.title}")
    print(f"Text: {reader.text}")
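    # A minimal usage sketch of the BeautifulSoup fallback path, reusing the
    # illustrative URL above (which may not resolve at run time);
    # newspaper=False routes extraction through extract_content_bs()
    # instead of newspaper4k.
    reader_bs = URLReader(url, newspaper=False)
    if reader_bs.is_extracted:
        print(f"Title (BS): {reader_bs.title}")
        print(f"Images found: {len(reader_bs.images or [])}")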