import copy
import re
from pathlib import Path
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

import requests
from bs4 import BeautifulSoup

root_path = Path.cwd()
# HTML tags that are stripped before extracting page text
html_filter = ['header', 'footer', 'svg', 'img', 'nav', 'script']


# check if the crawler is allowed to crawl the given url
def ask_robots(url: str, useragent: str = "*") -> bool:
    try:
        url_parsed = urlparse(url)
        url_robots_txt = url_parsed.scheme + '://' + url_parsed.netloc + '/robots.txt'
        print("robots.txt: ", url_robots_txt)
        robot_parser = RobotFileParser()
        robot_parser.set_url(url_robots_txt)
        robot_parser.read()
        print("Ask access to ", url)
        return robot_parser.can_fetch(useragent, url)
    except Exception as e:
        print("Ask Robots :", e)
        return False


def get_disallowed_urls(url, user_agent="*"):
    """
    Return all URLs disallowed for the given user agent by the site's robots.txt.

    :param url: A URL of the site to check (e.g. "https://example.com/some/page").
    :param user_agent: The user agent the rules are checked for (default: "*").
    :return: A list of the disallowed URLs.
    """
    url_parsed = urlparse(url)
    url_robots_txt = url_parsed.scheme + '://' + url_parsed.netloc + '/robots.txt'
    # list of disallowed paths
    disallowed_paths = []
    # download the robots.txt file as text
    response = requests.get(url_robots_txt)
    if response.status_code == 200:
        # parse the robots.txt line by line
        lines = response.text.splitlines()
        current_user_agent = None
        for line in lines:
            # skip blank lines and comments
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            # detect user-agent lines
            if line.lower().startswith("user-agent"):
                current_user_agent = line.split(":", 1)[1].strip()
            # detect disallow rules for the requested user agent
            elif line.lower().startswith("disallow") and current_user_agent == user_agent:
                disallow_path = line.split(":", 1)[1].strip()
                if disallow_path:
                    disallowed_paths.append(disallow_path)
    # extract the base URL (scheme + host)
    base_url = url_robots_txt.rsplit("/", 1)[0]
    # return full URLs
    disallowed_urls = [base_url + path for path in disallowed_paths]
    return disallowed_urls
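

# Illustrative helper (a sketch, not confirmed by the original code): the list
# returned by get_disallowed_urls() could be compiled into regex patterns for
# check_regex() below, which expects objects with a .match() method. The name
# build_disallow_patterns is hypothetical.
def build_disallow_patterns(disallowed_urls: list) -> list:
    """Compile each disallowed URL into a prefix-matching regex pattern."""
    return [re.compile(re.escape(disallowed)) for disallowed in disallowed_urls]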


# check if the url matches any of the regex patterns; exclude it from the crawler if it does
def check_regex(url: str, url_patterns: dict) -> bool:
    for pattern in url_patterns:
        if pattern.match(url):
            return False
    return True


# exclude the url if the page content contains none of the keywords
def check_keywords(content: str, keywords: dict) -> bool:
    for word in keywords:
        if re.search(word, content, flags=re.IGNORECASE):
            return True
    return False


# get only the text content of the page, without html tags
def get_page_content(soup: BeautifulSoup):
    soup_temp = copy.copy(soup)
    body = soup_temp.find('body')
    if body is None:
        return None
    # drop tags that carry no relevant text content
    for tag in html_filter:
        for el in body.find_all(tag):
            el.decompose()
    return prettify_content(body.text)


# collapse runs of blank lines into a single newline
def prettify_content(content: str):
    return re.sub(r'\n\s*\n', '\n', content)
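

# Illustrative usage (a minimal sketch, not part of the documented workflow):
# fetch a page and pass it through the helpers above. The URL and the keyword
# list are placeholder assumptions.
if __name__ == "__main__":
    demo_url = "https://example.com"
    if ask_robots(demo_url):
        print("Disallowed URLs:", get_disallowed_urls(demo_url))
        response = requests.get(demo_url)
        soup = BeautifulSoup(response.text, "html.parser")
        content = get_page_content(soup)
        if content and check_keywords(content, ["example"]):
            print(content)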