from src.crawler.CrawlerV2 import Crawler
from src.crawler.crawler_service import *
from src.crawler.utils.maps_types import MAPS_TYPES
from src.crawler.maps_api import get_maps_results
from src.persistence.db import *
import random
import re

import streamlit as st
import streamlit_nested_layout  # patches Streamlit to allow the nested st.expander blocks used below


@st.cache_resource
def init_connection():
    return init_db()
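

# crawl(item): crawl a single unsorted_urls entry, split the discovered URLs
# into event "overview" pages and plain sub-URLs, show both in the UI and mark
# the entry as crawled in MongoDB (the entry is deleted if crawling fails).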
def crawl(item):
    results = []
    try:
        st.info(f"Crawling {item['url']}")
        if "overview_pages" not in item:
            crawler = Crawler(item["url"], item["url_type"], depth=2)
            results = crawler.crawl()
    except Exception as e:
        st.error(f"Error while crawling: {e}")
        db.unsorted_urls.delete_one({"_id": item["_id"]})
        return

    # Detect overview pages
    overview_regex = re.compile(
        r"^https?:\/\/([a-zA-Z0-9.-]*\/)*(?!(advent))(kalender|.*veranstaltungen|veranstaltungskalender|.*events?|.*event-?kalender|([a-zA-Z]*)?programm|gottesdienste|auff(ü|ue)hrungen|termine|spielplan)(\/?|(\/?[a-zA-Z]*)\.[a-zA-Z]*)?$",
        re.IGNORECASE)
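    # The pattern matches URLs whose final path segment looks like a (German)
    # event-calendar page ("veranstaltungen", "programm", "termine", "spielplan",
    # ...), excludes "advent..." calendars, and is anchored to the end of the URL.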
    overview_pages = set()

    # Sort crawled URLs into overview pages and plain sub-URLs
    sub_urls = []
    for url in results:
        if overview_regex.match(url):
            overview_pages.add(url)
        else:
            sub_urls.append(url)
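
    # Fallback: if nothing matched at the end of a URL, retry with the same
    # pattern minus the "$" anchor and keep only the matching prefix, so e.g.
    # ".../veranstaltungen/2024/..." still yields an overview page.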
    if not overview_pages:
        overview_regex = re.compile(
            r"^https?:\/\/([a-zA-Z0-9.-]*\/)*(?!(advent))(kalender|.*veranstaltungen|veranstaltungskalender|.*events?|.*event-?kalender|([a-zA-Z]*)?programm|gottesdienste|auff(ü|ue)hrungen|termine|spielplan)(\/?|(\/?[a-zA-Z]*)\.[a-zA-Z]*)?",
            re.IGNORECASE)
        for url in results:
            match = overview_regex.search(url)
            if match:
                overview_pages.add(match.group())
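
    # Normalize to lower case so the same page is not stored twice with
    # different capitalization.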
    overview_pages = {url.casefold() for url in overview_pages}

    with st.expander("Found sub-URLs"):
        for url in sub_urls:
            st.write(url)
    with st.expander("Found overview pages:"):
        for url in overview_pages:
            st.write(url)

    # Update DB entry
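    # Only non-empty results are added to the $set, so existing
    # overview_pages / sub_urls in the DB are not overwritten with empty lists.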
    new_values = {"$set": {"crawled": True}}
    if overview_pages:
        new_values["$set"]["overview_pages"] = list(overview_pages)
    if sub_urls:
        item["sub_urls"] = sub_urls
        new_values["$set"]["sub_urls"] = sub_urls
    db.unsorted_urls.update_one({"_id": item["_id"]}, new_values)
    print(db.unsorted_urls.find_one({"_id": item["_id"]}))


db = init_connection()
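
# Illustrative shape of a db.unsorted_urls document, using only the fields this
# script reads or writes (values here are made up):
# {
#     "url": "https://www.example-stadt.de",
#     "url_type": "city",
#     "crawled": True,
#     "meta": {"location": "Example City", "website_host": "...", "address": "...", "maps_types": [...]},
#     "overview_pages": [...],
#     "sub_urls": [...],
#     "maps_searches": [...]
# }
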
# content
st.title("Event URL Search with Crawler and Google API")
st.write("""
Choose how many URLs the **crawler** should be started for. They are picked at random from the not-yet-crawled URLs in the DB.
If **"Find Google Maps results"** is enabled, new event venues are additionally searched for the city portals.""")

with st.form("Crawler Settings"):
    count = st.number_input("How many URLs should be crawled?", step=1)
    maps = st.checkbox("Find Google Maps results")
    # Every form must have a submit button.
    submitted = st.form_submit_button("Start crawler")
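
    # On submit, process `count` not-yet-crawled DB entries one after another.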
    if submitted:
        for i in range(count):
            item = db.unsorted_urls.find_one({"crawled": None})
            if item is None:
                st.warning("No uncrawled URLs left in the DB")
                break
            with st.expander(f"Results for {item['url']} in {item['meta']['location']}"):
                if item["url_type"] == "city" and maps:
                    for type_id in random.sample(MAPS_TYPES, 5):
                        print(item)
                        if "maps_searches" not in item or type_id not in item["maps_searches"]:
                            st.info(f"Searching Maps results for {type_id} in {item['meta']['location']}")
                            try:
                                maps_results = get_maps_results(type_id, item["meta"]["location"])
                                if maps_results:
                                    new_elements = []
                                    with st.expander("Maps results"):
                                        for result in maps_results:
                                            # Skip social media links and URLs already collected in this batch
                                            if (result.website_uri
                                                    and "facebook" not in result.website_uri
                                                    and "instagram" not in result.website_uri
                                                    and "tiktok" not in result.website_uri
                                                    and result.website_uri not in [e["url"] for e in new_elements]):
                                                element = {
                                                    "url_type": type_id,
                                                    "url": result.website_uri,
                                                    "meta": {
                                                        "website_host": result.display_name.text,
                                                        "location": result.formatted_address.split(", ")[1],
                                                        "address": result.formatted_address,
                                                        "maps_types": list(result.types),
                                                    },
                                                }
                                                st.write(f"{element['meta']['website_host']} - {element['url']}")
                                                new_elements.append(element)
                                    if new_elements:
                                        db.unsorted_urls.insert_many(new_elements)
                            except Exception as e:
                                st.error(e)
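
                            # Remember which Maps place types were already queried
                            # for this item (note: in this script the list is only
                            # kept on the in-memory item, not written back to the DB).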
if "maps_searches" in item:
maps_searches = item["maps_searches"]
maps_searches.append(type_id)
item["maps_searches"] = maps_searches
else:
item["maps_searches"] = [type_id]
else:
st.success("Maps Ergebnisse bereits in DB")
crawl(item)