import re
import copy
from pathlib import Path
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

import requests
from bs4 import BeautifulSoup

root_path = Path.cwd()

# Tags that are stripped from a page's <body> before its text is extracted.
html_filter = ['header', 'footer', 'svg', 'img', 'nav', 'script']


def ask_robots(url: str, useragent: str = "*") -> bool:
    """Check the site's robots.txt and return True if `url` may be fetched."""
    try:
        url_parsed = urlparse(url)
        url_robots_txt = url_parsed.scheme + '://' + url_parsed.netloc + '/robots.txt'
        print("robots.txt:", url_robots_txt)

        robot_parser = RobotFileParser()
        robot_parser.set_url(url_robots_txt)
        robot_parser.read()

        print("Ask access to", url)
        # Check against the caller-supplied user agent rather than a hard-coded '*'.
        return robot_parser.can_fetch(useragent, url)
    except Exception as e:
        print("Ask Robots:", e)
        # Be conservative: treat an unreadable robots.txt as "not allowed".
        return False


def get_disallowed_urls(url, user_agent="*"):
    """
    Return all disallowed URLs for the given user agent from the site's robots.txt.

    :param url: A URL on the target site (e.g. "https://example.com/"); the
        robots.txt location is derived from its scheme and host.
    :param user_agent: The user agent the rules are checked for (default: "*").
    :return: A list of the disallowed URLs.
    """
    url_parsed = urlparse(url)
    url_robots_txt = url_parsed.scheme + '://' + url_parsed.netloc + '/robots.txt'

    disallowed_paths = []

    response = requests.get(url_robots_txt, timeout=10)
    if response.status_code == 200:
        lines = response.text.splitlines()
        current_user_agent = None
        for line in lines:
            line = line.strip()
            # Skip blank lines and comments.
            if not line or line.startswith("#"):
                continue

            if line.lower().startswith("user-agent"):
                current_user_agent = line.split(":", 1)[1].strip()
            elif line.lower().startswith("disallow") and current_user_agent == user_agent:
                # Split on the first colon only, so paths containing ':' stay intact.
                disallow_path = line.split(":", 1)[1].strip()
                if disallow_path:
                    disallowed_paths.append(disallow_path)

    # robots.txt sits at the site root, so stripping "/robots.txt" yields the base URL.
    base_url = url_robots_txt.rsplit("/", 1)[0]
    disallowed_urls = [base_url + path for path in disallowed_paths]
    return disallowed_urls
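
# Illustrative usage sketch (not part of the module's call flow; the
# example.com URL below is a placeholder): get_disallowed_urls() accepts any
# URL on the target site and returns absolute URLs built from the matching
# Disallow rules.
#
#     blocked = get_disallowed_urls("https://example.com/", user_agent="*")
#     for blocked_url in blocked:
#         print("Disallowed:", blocked_url)
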
def check_regex(url: str, url_patterns: dict) -> bool:
    """Return False if `url` matches any of the given compiled patterns, otherwise True."""
    for pattern in url_patterns:
        if pattern.match(url):
            return False
    return True


def check_keywords(content: str, keywords: dict) -> bool:
    """Return True if any of the keywords occurs in `content` (case-insensitive)."""
    for word in keywords:
        if re.search(word, content, flags=re.IGNORECASE):
            return True
    return False
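
# Illustrative usage sketch (assumption, not from the original code: both
# helpers are expected to receive iterables — compiled regex patterns for
# check_regex and plain keyword strings for check_keywords; the values below
# are placeholders):
#
#     exclude_patterns = [re.compile(r'.*/login.*'), re.compile(r'.*\.pdf$')]
#     if check_regex("https://example.com/blog/post", exclude_patterns):
#         pass  # URL matches none of the exclusion patterns -> keep crawling
#
#     if check_keywords(page_text, ["python", "crawler"]):
#         pass  # at least one keyword occurs in the page text
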
def get_page_content(soup: BeautifulSoup):
    """Return the visible text of the page body, with filtered tags removed."""
    # Work on a copy so the caller's soup is left untouched.
    soup_temp = copy.copy(soup)
    body = soup_temp.find('body')
    if body is None:
        return None
    # Remove non-content elements (see html_filter) before extracting text.
    for tag in html_filter:
        for el in body.find_all(tag):
            el.decompose()
    return prettify_content(body.text)


def prettify_content(content: str) -> str:
    """Collapse runs of blank lines into single newlines."""
    return re.sub(r'\n\s*\n', '\n', content)
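

if __name__ == "__main__":
    # Minimal end-to-end sketch (illustrative only; example.com is a
    # placeholder, not part of the original module): respect robots.txt,
    # fetch one page, and print its cleaned body text.
    start_url = "https://example.com/"
    if ask_robots(start_url):
        response = requests.get(start_url, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        print(get_page_content(soup))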
|