import re
import copy
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser
from pathlib import Path
import requests


root_path = Path.cwd()
html_filter = ['header', 'footer', 'svg', 'img', 'nav', 'script']

# Check whether the crawler is allowed to fetch the given URL according to robots.txt
def ask_robots(url: str, useragent: str = "*") -> bool:
    try:
        url_parsed = urlparse(url)
        url_robots_txt = url_parsed.scheme + '://' + url_parsed.netloc + '/robots.txt'
        print("robots.txt: ", url_robots_txt)
        robot_parser = RobotFileParser()
        robot_parser.set_url(url_robots_txt)
        robot_parser.read()

        print("Ask access to ", url)
        return robot_parser.can_fetch(useragent, url)
    except Exception as e:
        print("Ask Robots :", e)
        return False


def get_disallowed_urls(url, user_agent="*"):
    """
    Return all disallowed URLs for the given user agent from the site's robots.txt.

    :param url: Any URL of the site (e.g. "https://example.com/page"); the robots.txt
                location is derived from its scheme and host.
    :param user_agent: The user agent the rules are checked for (default: "*").
    :return: A list of the disallowed URLs.
    """
    # Build the robots.txt URL from the scheme and host of the given URL
    url_parsed = urlparse(url)
    url_robots_txt = url_parsed.scheme + '://' + url_parsed.netloc + '/robots.txt'

    # Collect the disallowed paths listed in robots.txt
    disallowed_paths = []

    # Download the robots.txt file as text
    response = requests.get(url_robots_txt)
    if response.status_code == 200:
        # Parse the robots.txt line by line
        lines = response.text.splitlines()
        current_user_agent = None
        for line in lines:
            # Skip blank lines and comments
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            # Detect user-agent lines
            if line.lower().startswith("user-agent"):
                current_user_agent = line.split(":", 1)[1].strip()

            # Detect disallow rules for the requested user agent
            elif line.lower().startswith("disallow") and current_user_agent == user_agent:
                disallow_path = line.split(":", 1)[1].strip()
                if disallow_path:
                    disallowed_paths.append(disallow_path)

    # Base URL of the site (scheme + host)
    base_url = url_robots_txt.rsplit("/", 1)[0]

    # Return the full disallowed URLs
    disallowed_urls = [base_url + path for path in disallowed_paths]
    return disallowed_urls


# Check whether the URL matches any of the regex patterns; if it does, exclude it from the crawl
def check_regex(url: str, url_patterns: dict) -> bool:
    for pattern in url_patterns:
        if pattern.match(url):
            return False
    return True


# Exclude the URL if the page content contains none of the keywords
def check_keywords(content: str, keywords: dict) -> bool:
    for word in keywords:
        if re.search(word, content, flags=re.IGNORECASE):
            return True
    return False

# Extract only the text content of the page, without HTML tags
def get_page_content(soup: BeautifulSoup):
    soup_temp = copy.copy(soup)
    body = soup_temp.find('body')
    if body is None:
        return None
    # Remove navigation, media and script elements before extracting the text
    for tag in html_filter:
        for el in body.find_all(tag):
            el.decompose()
    return prettify_content(body.text)


# Collapse runs of blank lines into single newlines
def prettify_content(content: str):
    return re.sub(r'\n\s*\n', '\n', content)
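

# Minimal usage sketch (not part of the original module): ties the helpers above
# together for a single page fetch. The example URL, the compiled pattern list and
# the keyword list are illustrative assumptions, not values taken from the crawler.
if __name__ == "__main__":
    example_url = "https://example.com/"        # assumed placeholder URL
    url_patterns = [re.compile(r".*\.pdf$")]    # assumed: patterns to exclude
    keywords = ["python", "crawler"]            # assumed: required keywords

    if ask_robots(example_url) and check_regex(example_url, url_patterns):
        response = requests.get(example_url)
        soup = BeautifulSoup(response.text, "html.parser")
        content = get_page_content(soup)
        if content and check_keywords(content, keywords):
            print(content[:500])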