import re


def _contains(fragment):
    """Compile a case-insensitive pattern matching any URL containing *fragment*.

    The fragment is wrapped in a non-capturing group so that a top-level
    alternation (``a|b``) stays inside the surrounding ``.*`` wildcards.
    The leading and trailing ``.*`` keep the compiled pattern usable with
    ``match()`` and ``fullmatch()`` as well as ``search()``.
    """
    return re.compile(r'.*(?:' + fragment + r').*', re.IGNORECASE)


# Patterns for URLs that should not be crawled.
#
# Every keyword is matched literally and case-insensitively anywhere in the
# URL.  A quantifier directly after a keyword (e.g. ``sepa*``) would make its
# last letter optional and match unrelated words such as "september", so
# keywords are written without one.  Where both singular and plural should be
# filtered, the plural ending is made optional explicitly (``fotos?``).
PATTERNS = [
    _contains(r'about us'),
    _contains(r'about.*us'),
    _contains(r'a(b|n)meld(en|ung)?'),
    _contains(r'about'),
    _contains(r'agb'),
    _contains(r'archiv'),
    _contains(r'aussteller'),
    _contains(r'auszeichnung'),
    _contains(r'barrierefrei'),
    _contains(r'bestellverfolgung'),
    _contains(r'bezahlung'),
    _contains(r'bilder'),
    _contains(r'cart'),
    _contains(r'checkout'),
    _contains(r'contact'),
    _contains(r'credit card'),
    # Direct links to images/documents; extensions are matched regardless of
    # case so that e.g. ".JPG" and ".PDF" are filtered too.
    re.compile(r'^https?://(?:www\.)?.*\.(jpg|png|pdf)$', re.IGNORECASE),
    _contains(r'datenschutz'),
    _contains(r'debit'),
    _contains(r'download'),
    _contains(r'dsgvo'),
    _contains(r'faq'),
    _contains(r'firmen(feiern|veranstaltungen)'),
    _contains(r'f(oe|ö)rder'),
    _contains(r'for-rent'),
    _contains(r'fotos?'),
    _contains(r'g(ae|ä)steliste'),
    _contains(r'galerie'),
    _contains(r'galler(?:y|ies)'),
    _contains(r'geschaeftsbedingungen'),
    _contains(r'geschäftsbedingungen'),
    _contains(r'giropay'),
    _contains(r'guestlist'),
    _contains(r'hinweis'),
    _contains(r'impressum'),
    _contains(r'info'),
    _contains(r'jobs?'),
    _contains(r'karriere'),
    _contains(r'kasse'),
    _contains(r'klarna'),
    _contains(r'kontakt'),
    _contains(r'konto'),
    _contains(r'kreditkarte'),
    _contains(r'lastschrift'),
    _contains(r'landschaftsprogramm'),
    _contains(r'lieferung'),
    _contains(r'location'),
    _contains(r'login'),
    _contains(r'mein-konto'),
    _contains(r'meine-bestellungen'),
    _contains(r'merch'),
    _contains(r'merkzettel'),
    _contains(r'mieten?'),
    _contains(r'account'),
    _contains(r'orders?'),
    _contains(r'news'),
    _contains(r'newsletter'),
    _contains(r'pay'),
    _contains(r'payment'),
    _contains(r'paypal'),
    _contains(r'personal'),
    _contains(r'photos?'),
    _contains(r'pics'),
    _contains(r'pictures?'),
    _contains(r'polic(?:y|ies)'),
    _contains(r'portfolio'),
    _contains(r'press'),
    _contains(r'pressemeldung'),
    _contains(r'pressemitteilungen'),
    _contains(r'privacy-policy'),
    _contains(r'produkt'),
    _contains(r'rathaus'),
    _contains(r'rechnung'),
    _contains(r'sepa'),
    _contains(r'shop'),
    _contains(r'signup'),
    _contains(r'sofort'),
    _contains(r'support'),
    _contains(r'terms-of-use'),
    # Social-media links; the alternation is grouped by _contains() so every
    # network name is matched anywhere in the URL, not only at its start.
    _contains(r'twitter|facebook|instagram|tiktok'),
    _contains(r'(ue|ü)ber.?uns'),
    _contains(r'unterricht'),
    _contains(r'versand'),
    _contains(r'verbraucherschutz'),
    _contains(r'warenkorb'),
    _contains(r'wegbeschreibung'),
    _contains(r'widerrufsbelehrung'),
    _contains(r'wishlist'),
    _contains(r'zahlung'),
    # The dot is escaped so that only a literal ".jpeg" is matched.
    re.compile(r'.*\.jpeg', re.IGNORECASE),
]