from bs4 import BeautifulSoup
from newspaper import article, ArticleException, ArticleBinaryDataException
import requests

# TODO: move this to a config file
MAX_URL_SIZE = 2_000_000  # ~2 MB
class URLReader:
    def __init__(self, url: str, newspaper: bool = True):
        self.url = url
        self.text = None       # str
        self.title = None      # str
        self.images = None     # list of image URLs
        self.top_image = None  # image URL
        self.is_extracted = False

        url_size = self.get_size()
        if url_size is None or url_size > MAX_URL_SIZE:
            return
        self.is_extracted = True

        self.newspaper = newspaper  # True: use newspaper4k; False: use BeautifulSoup
        if self.newspaper:
            self.extract_content_newspaper()
        else:
            self.extract_content_bs()
    def extract_content_newspaper(self):
        """
        Use newspaper4k to extract content from self.url.

        Populates self.title, self.text, self.images, and self.top_image,
        or returns early (leaving them as None) on failure.
        """
        try:
            # Preflight request to verify the URL is reachable
            response = requests.get(self.url, timeout=10)
            response.raise_for_status()  # Raise exception for unsuccessful requests
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None

        try:
            news = article(url=self.url, fetch_images=True)
        except (ArticleException, ArticleBinaryDataException) as e:
            print(f"\t\t└── Error downloading article: {e}")
            return None

        self.title = news.title
        self.text = news.text
        self.images = list(set(news.images))  # Remove duplicates
        self.top_image = news.top_image
    def extract_content_bs(self):
        """
        Use BeautifulSoup to extract and process content from self.url.
        """
        try:
            response = requests.get(self.url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None
        response.encoding = response.apparent_encoding

        try:
            soup = BeautifulSoup(response.content, "html.parser")
        except Exception:
            print(f"Error parsing HTML content from {self.url}")
            return None

        self.title = soup.title.string.strip() if soup.title and soup.title.string else None
        self.images = [img["src"] for img in soup.find_all("img") if img.get("src")]
        self.top_image = self.images[0] if self.images else None

        # Exclude text within specific elements
        for element in soup(["img", "figcaption", "table", "script", "style"]):
            element.extract()

        # Join the text of all <p> tags rather than using soup.get_text(),
        # which would also pick up navigation and other boilerplate text
        paragraphs = soup.find_all("p")
        self.text = " ".join(p.get_text() for p in paragraphs)
    def get_size(self):
        """
        Retrieve the size of the URL's content using a HEAD request.

        Returns:
            The size of the content in bytes, or None if the size cannot be
            determined (e.g., due to network errors or a missing
            Content-Length header).
        """
        try:
            response = requests.head(self.url, allow_redirects=True, timeout=5)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
            content_length = response.headers.get("Content-Length")
            if content_length is not None:
                return int(content_length)
            print("\t\t└── Content-Length header not found")
            return None
        except requests.exceptions.RequestException as e:
            print(f"\t\t└── Error getting URL size: {e}")
            return None