import random
import re

# st and re are used below; imported explicitly rather than relying on the
# wildcard imports.
import streamlit as st
# Imported for its side effect: allows nesting st.expander (used below).
import streamlit_nested_layout

from src.crawler.CrawlerV2 import Crawler
from src.crawler.crawler_service import *
from src.crawler.maps_api import get_maps_results
from src.crawler.utils.maps_types import MAPS_TYPES
from src.persistence.db import *
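# Streamlit page: picks uncrawled URLs from the DB, optionally enriches city
# portals with venues from the Google Maps API, and crawls each URL for event
# overview pages and sub-URLs.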


@st.cache_resource
def init_connection():
    # Cached so the DB connection is created once and shared across reruns.
    return init_db()


def crawl(item):
    results = []
    try:
        st.info(f"Crawling {item['url']}")

        # Only crawl if the overview pages have not been extracted before.
        if "overview_pages" not in item:
            crawler = Crawler(item["url"], item["url_type"], depth=2)
            results = crawler.crawl()
    except Exception as e:
        st.error(f"Error while crawling: {e}")
        db.unsorted_urls.delete_one({"_id": item["_id"]})
        return

    # Match complete URLs that look like (German) event overview pages,
    # excluding anything that starts with "advent".
    overview_regex = re.compile(
        r"^https?:\/\/([a-zA-Z0-9.-]*\/)*(?!(advent))(kalender|.*veranstaltungen|veranstaltungskalender|.*events?|.*event-?kalender|([a-zA-Z]*)?programm|gottesdienste|auff(ü|ue)hrungen|termine|spielplan)(\/?|(\/?[a-zA-Z]*)\.[a-zA-Z]*)?$",
        re.IGNORECASE)
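    # Illustration (hypothetical URLs): "https://example.de/veranstaltungen"
    # and "https://example.de/programm.html" match, while
    # "https://example.de/adventskalender" does not.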
    overview_pages = set()

    sub_urls = []
    for url in results:
        if overview_regex.match(url):
            overview_pages.add(url)
        else:
            sub_urls.append(url)

    # Fallback: drop the end anchor and keep only the matching URL prefix.
    if not overview_pages:
        overview_regex = re.compile(
            r"^https?:\/\/([a-zA-Z0-9.-]*\/)*(?!(advent))(kalender|.*veranstaltungen|veranstaltungskalender|.*events?|.*event-?kalender|([a-zA-Z]*)?programm|gottesdienste|auff(ü|ue)hrungen|termine|spielplan)(\/?|(\/?[a-zA-Z]*)\.[a-zA-Z]*)?",
            re.IGNORECASE)
        for url in results:
            match = overview_regex.search(url)
            if match:
                overview_pages.add(match.group())
    overview_pages = {url.casefold() for url in overview_pages}

    with st.expander("Discovered sub-URLs"):
        for url in sub_urls:
            st.write(url)
    with st.expander("Discovered overview pages:"):
        for url in overview_pages:
            st.write(url)

    new_values = {"$set": {"crawled": True}}
    if overview_pages:
        new_values["$set"]["overview_pages"] = list(overview_pages)
    if sub_urls:
        item["sub_urls"] = sub_urls
        new_values["$set"]["sub_urls"] = sub_urls
    # Also persist which Maps place types were already searched (see the main
    # loop below); otherwise the same types would be queried again next run.
    if "maps_searches" in item:
        new_values["$set"]["maps_searches"] = item["maps_searches"]

    db.unsorted_urls.update_one({"_id": item["_id"]}, new_values)


db = init_connection()


st.title("Event URL search with crawler and Google API")
st.write("""
Choose how many URLs the **crawler** should be started for. They are picked at random from the not-yet-crawled URLs in the DB.
If **"Find Google Maps results"** is enabled, new event venues are additionally looked up for city portals.""")
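# Flow: the form selects the batch size; on submit, each picked URL is first
# (optionally) enriched with Google Maps venues and then crawled.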
with st.form("Crawler Settings"):
    count = st.number_input("How many URLs should be crawled?", min_value=1, step=1)
    maps = st.checkbox("Find Google Maps results")

    submitted = st.form_submit_button("Start crawler")

if submitted:
    for _ in range(count):
        # Randomly pick a not-yet-crawled URL, as described above.
        sampled = list(db.unsorted_urls.aggregate(
            [{"$match": {"crawled": None}}, {"$sample": {"size": 1}}]))
        if not sampled:
            st.warning("No uncrawled URLs left in the DB.")
            break
        item = sampled[0]
        with st.expander(f"Results for {item['url']} in {item['meta']['location']}"):
            if item["url_type"] == "city" and maps:
                for type_id in random.sample(MAPS_TYPES, 5):
                    # Skip place types this item was already searched for.
                    if type_id not in item.get("maps_searches", []):
                        st.info(f"Searching Maps results for {type_id} in {item['meta']['location']}")
                        try:
                            maps_results = get_maps_results(type_id, item["meta"]["location"])
                            if maps_results:
                                new_elements = []
                                with st.expander("Maps results"):
                                    for result in maps_results:
                                        # Keep results that have a website, skipping
                                        # social-media links and in-batch duplicates.
                                        if result.website_uri \
                                                and "facebook" not in result.website_uri \
                                                and "instagram" not in result.website_uri \
                                                and "tiktok" not in result.website_uri \
                                                and result.website_uri not in [e["url"] for e in new_elements]:
                                            element = {
                                                "url_type": type_id,
                                                "url": result.website_uri,
                                                "meta": {
                                                    "website_host": result.display_name.text,
                                                    # Assumes a "street, city, ..." address format.
                                                    "location": result.formatted_address.split(", ")[1],
                                                    "address": result.formatted_address,
                                                    "maps_types": list(result.types),
                                                },
                                            }
                                            st.write(f"{element['meta']['website_host']} - {element['url']}")
                                            new_elements.append(element)
                                if new_elements:
                                    db.unsorted_urls.insert_many(new_elements)
                        except Exception as e:
                            st.error(e)
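                        # Record the place type as searched; crawl() persists
                        # this list to the DB together with the crawl results.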
                        item.setdefault("maps_searches", []).append(type_id)
                    else:
                        st.success("Maps results already in DB")

            crawl(item)