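"""Streamlit page: find event URLs with the crawler and the Google Maps API.

Picks not-yet-crawled entries from the ``unsorted_urls`` collection, crawls them
to collect event overview pages and sub-URLs, and optionally queries Google Maps
for new venues around city portals.
"""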
from src.crawler.CrawlerV2 import Crawler
from src.crawler.crawler_service import *
from src.crawler.utils.maps_types import MAPS_TYPES
from src.crawler.maps_api import get_maps_results
from src.persistence.db import *
import random
import re

import streamlit as st
import streamlit_nested_layout  # imported for its side effect: enables nested st.expander blocks


# Cache the DB connection so it is created once and reused across Streamlit reruns
@st.cache_resource
def init_connection():
    return init_db()
def crawl(item):
    results = []
    try:
        st.info(f"Crawle {item['url']}")
        # Only crawl URLs whose overview pages have not been collected yet
        if "overview_pages" not in item:
            crawler = Crawler(item["url"], item["url_type"], depth=2)
            results = crawler.crawl()
    except Exception as e:
        st.error(f"Fehler beim Crawlen: {e}")
        db.unsorted_urls.delete_one({"_id": item["_id"]})
        return
    # Detect overview pages (event calendars, programme pages, etc.)
    overview_regex = re.compile(
        r"^https?:\/\/([a-zA-Z0-9.-]*\/)*(?!(advent))(kalender|.*veranstaltungen|veranstaltungskalender|.*events?|.*event-?kalender|([a-zA-Z]*)?programm|gottesdienste|auff(ü|ue)hrungen|termine|spielplan)(\/?|(\/?[a-zA-Z]*)\.[a-zA-Z]*)?$",
        re.IGNORECASE)
    overview_pages = set()

    # Sort the crawled URLs into overview pages and sub-URLs
    sub_urls = []
    for url in results:
        if overview_regex.match(url):
            overview_pages.add(url)
        else:
            sub_urls.append(url)

    # Fallback: if no URL matched as a whole, search for the pattern anywhere in the URL
    if not overview_pages:
        overview_regex = re.compile(
            r"^https?:\/\/([a-zA-Z0-9.-]*\/)*(?!(advent))(kalender|.*veranstaltungen|veranstaltungskalender|.*events?|.*event-?kalender|([a-zA-Z]*)?programm|gottesdienste|auff(ü|ue)hrungen|termine|spielplan)(\/?|(\/?[a-zA-Z]*)\.[a-zA-Z]*)?",
            re.IGNORECASE)
        for url in results:
            match = overview_regex.search(url)
            if match:
                overview_pages.add(match.group())

    overview_pages = {url.casefold() for url in overview_pages}
with st.expander("Gefundene Suburls"):
for url in sub_urls:
st.write(url)
with st.expander("Gefundene Übersichtsseiten:"):
for url in overview_pages:
st.write(url)
# Update DB entry
new_values = {"$set": {"crawled": True}}
if overview_pages:
new_values["$set"]["overview_pages"] = list(overview_pages)
if sub_urls:
item["sub_urls"] = sub_urls
new_values["$set"]["sub_urls"] = sub_urls
db.unsorted_urls.update_one({"_id":item["_id"]}, new_values)
print(db.unsorted_urls.find_one({"_id":item["_id"]}))

db = init_connection()

# content
st.title("Event-Urls-Suche mit Crawler und Google API")
st.write("""
Wähle aus, für wie viele Urls der **Crawler** gestartet werden soll. Diese werden zufällig aus den noch nicht gecrawlten Urls aus der DB ausgewählt.
Wenn **"Google Maps Ergebnisse finden"** aktiviert ist, werden bei den Stadtportalen zusätzlich noch neue Veranstaltungsorte gesucht.""")
with st.form("Crawler Settings"):
    # Integer min_value/step so the widget returns an int that can be used with range()
    count = st.number_input("Wie viele URLs sollen gecrawled werden?", min_value=1, step=1)
    maps = st.checkbox("Google Maps Ergebnisse finden")
    # Every form must have a submit button.
    submitted = st.form_submit_button("Starte Crawler")

if submitted:
    for i in range(count):
        item = db.unsorted_urls.find_one({"crawled": None})
        # Stop when there are no uncrawled URLs left
        if item is None:
            st.warning("Keine ungecrawlten Urls mehr in der DB")
            break
        with st.expander(f"Ergebnisse für {item['url']} in {item['meta']['location']}"):
            if item["url_type"] == "city" and maps:
                for type_id in random.sample(MAPS_TYPES, 5):
                    print(item)
                    # Skip Maps types that were already searched for this item
                    if "maps_searches" not in item or type_id not in item["maps_searches"]:
st.info(f"Suche Maps Ergebnisse für {type_id} in {item['meta']['location']}")
try:
maps_results = get_maps_results(type_id, item["meta"]["location"])
if maps_results:
new_elements = []
with st.expander("Maps Ergebnisse"):
for result in maps_results:
if result.website_uri \
and "facebook" not in result.website_uri \
and "instagram" not in result.website_uri \
and "tiktok" not in result.website_uri \
and result.website_uri not in [e["url"] for e in new_elements]:
element = {
"url_type": type_id,
"url": result.website_uri,
"meta":{
"website_host": result.display_name.text,
"location": result.formatted_address.split(", ")[1],
"address": result.formatted_address,
"maps_types": list(result.types)
}}
st.write(f"{element['meta']['website_host']} - {element['url']}")
new_elements.append(element)
if new_elements:
db.unsorted_urls.insert_many(new_elements)
except Exception as e:
st.error(e)
if "maps_searches" in item:
maps_searches = item["maps_searches"]
maps_searches.append(type_id)
item["maps_searches"] = maps_searches
else:
item["maps_searches"] = [type_id]
else:
st.success("Maps Ergebnisse bereits in DB")
crawl(item)