File size: 4,613 Bytes
da88570 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
import re
# define patterns to filter urls that should not be crawled
# Each entry is a compiled, case-insensitive "URL contains <keyword>" test.
# Keywords are wrapped in a leading and trailing `.*` so the result is the
# same whether a caller uses Pattern.match, Pattern.search or
# Pattern.fullmatch.
#
# NOTE: a quantifier binds only to the single character (or group) directly
# before it.  Writing `.*pay*.*` therefore means "contains 'pa'", not
# "contains 'pay'", and would reject unrelated URLs such as ".../page/2".
# Keywords below are spelled out in full; where a shorter word form is wanted
# as well (singular/plural) the last letter is made optional explicitly
# with `?`.
PATTERNS = [
    re.compile(r'.*about us.*', re.IGNORECASE),
    re.compile(r'.*about.*us.*', re.IGNORECASE),
    # The optional group means any URL containing "abmeld"/"anmeld" matches.
    re.compile(r'.*a(b|n)meld(en|ung)*.*', re.IGNORECASE),
    re.compile(r'.*about.*', re.IGNORECASE),
    re.compile(r'.*agb.*', re.IGNORECASE),
    re.compile(r'.*archiv.*', re.IGNORECASE),
    re.compile(r'.*aussteller.*', re.IGNORECASE),
    re.compile(r'.*auszeichnung.*', re.IGNORECASE),
    re.compile(r'.*barrierefrei.*', re.IGNORECASE),
    re.compile(r'.*bestellverfolgung.*', re.IGNORECASE),
    re.compile(r'.*bezahlung.*', re.IGNORECASE),
    re.compile(r'.*bilder.*', re.IGNORECASE),
    re.compile(r'.*cart.*', re.IGNORECASE),
    re.compile(r'.*checkout.*', re.IGNORECASE),
    re.compile(r'.*contact.*', re.IGNORECASE),
    re.compile(r'.*credit card.*', re.IGNORECASE),
    # Direct links to static files; case-insensitive so ".JPG" is caught too.
    re.compile(r'^https?://(?:www\.)?.*\.(jpg|png|pdf)$', re.IGNORECASE),
    re.compile(r'.*datenschutz.*', re.IGNORECASE),
    re.compile(r'.*debit.*', re.IGNORECASE),
    re.compile(r'.*download.*', re.IGNORECASE),
    re.compile(r'.*dsgvo.*', re.IGNORECASE),
    re.compile(r'.*faq.*', re.IGNORECASE),
    re.compile(r'.*firmen(feiern|veranstaltungen).*', re.IGNORECASE),
    re.compile(r'.*f(oe|ö)rder.*', re.IGNORECASE),
    re.compile(r'.*for-rent.*', re.IGNORECASE),
    re.compile(r'.*fotos?.*', re.IGNORECASE),
    re.compile(r'.*g(ae|ä)steliste.*', re.IGNORECASE),
    re.compile(r'.*galerie.*', re.IGNORECASE),
    re.compile(r'.*gallery.*', re.IGNORECASE),
    re.compile(r'.*geschaeftsbedingungen.*', re.IGNORECASE),
    re.compile(r'.*geschäftsbedingungen.*', re.IGNORECASE),
    re.compile(r'.*giropay.*', re.IGNORECASE),
    re.compile(r'.*guestlist.*', re.IGNORECASE),
    re.compile(r'.*hinweise?.*', re.IGNORECASE),
    re.compile(r'.*impressum.*', re.IGNORECASE),
    re.compile(r'.*info.*', re.IGNORECASE),
    re.compile(r'.*jobs?.*', re.IGNORECASE),
    re.compile(r'.*karriere.*', re.IGNORECASE),
    re.compile(r'.*kasse.*', re.IGNORECASE),
    re.compile(r'.*klarna.*', re.IGNORECASE),
    re.compile(r'.*kontakt.*', re.IGNORECASE),
    re.compile(r'.*konto.*', re.IGNORECASE),
    re.compile(r'.*kreditkarte.*', re.IGNORECASE),
    re.compile(r'.*lastschrift.*', re.IGNORECASE),
    re.compile(r'.*landschaftsprogramm.*', re.IGNORECASE),
    re.compile(r'.*lieferung.*', re.IGNORECASE),
    re.compile(r'.*location.*', re.IGNORECASE),
    re.compile(r'.*login.*', re.IGNORECASE),
    re.compile(r'.*mein-konto.*', re.IGNORECASE),
    re.compile(r'.*meine-bestellungen.*', re.IGNORECASE),
    re.compile(r'.*merch.*', re.IGNORECASE),
    re.compile(r'.*merkzettel.*', re.IGNORECASE),
    re.compile(r'.*mieten?.*', re.IGNORECASE),
    re.compile(r'.*account.*', re.IGNORECASE),
    re.compile(r'.*orders?.*', re.IGNORECASE),
    re.compile(r'.*news.*', re.IGNORECASE),
    re.compile(r'.*newsletter.*', re.IGNORECASE),
    re.compile(r'.*pay.*', re.IGNORECASE),
    re.compile(r'.*payment.*', re.IGNORECASE),
    re.compile(r'.*paypal.*', re.IGNORECASE),
    re.compile(r'.*personal.*', re.IGNORECASE),
    re.compile(r'.*photos?.*', re.IGNORECASE),
    re.compile(r'.*pics.*', re.IGNORECASE),
    re.compile(r'.*pictures?.*', re.IGNORECASE),
    re.compile(r'.*policy.*', re.IGNORECASE),
    re.compile(r'.*portfolio.*', re.IGNORECASE),
    re.compile(r'.*press.*', re.IGNORECASE),
    re.compile(r'.*pressemeldung.*', re.IGNORECASE),
    re.compile(r'.*pressemitteilungen.*', re.IGNORECASE),
    re.compile(r'.*privacy-policy.*', re.IGNORECASE),
    re.compile(r'.*produkt.*', re.IGNORECASE),
    re.compile(r'.*rathaus.*', re.IGNORECASE),
    re.compile(r'.*rechnung.*', re.IGNORECASE),
    re.compile(r'.*sepa.*', re.IGNORECASE),
    re.compile(r'.*shop.*', re.IGNORECASE),
    re.compile(r'.*signup.*', re.IGNORECASE),
    re.compile(r'.*sofort.*', re.IGNORECASE),
    re.compile(r'.*support.*', re.IGNORECASE),
    re.compile(r'.*terms-of-use.*', re.IGNORECASE),
    # The alternation must be grouped: `|` binds looser than concatenation,
    # so without the group the surrounding `.*` would apply only to the
    # first and last alternative.
    re.compile(r'.*(?:twitter|facebook|instagram|tiktok).*', re.IGNORECASE),
    re.compile(r'.*(ue|ü)ber.?uns.*', re.IGNORECASE),
    re.compile(r'.*unterricht.*', re.IGNORECASE),
    re.compile(r'.*versand.*', re.IGNORECASE),
    re.compile(r'.*verbraucherschutz.*', re.IGNORECASE),
    re.compile(r'.*warenkorb.*', re.IGNORECASE),
    re.compile(r'.*wegbeschreibung.*', re.IGNORECASE),
    re.compile(r'.*widerrufsbelehrung.*', re.IGNORECASE),
    re.compile(r'.*wishlist.*', re.IGNORECASE),
    re.compile(r'.*zahlung.*', re.IGNORECASE),
    # Escaped dot: match the ".jpeg" extension, not any character + "jpeg".
    re.compile(r'.*\.jpeg', re.IGNORECASE),
]
|