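"""Streamlit page: find event URLs with the crawler and the Google Maps API.

Picks not-yet-crawled entries from the ``unsorted_urls`` collection, optionally
searches Google Maps for new venues around city portals, and writes the
discovered overview pages and sub-URLs back to the database.
"""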
import random
import re

import streamlit as st
import streamlit_nested_layout  # patches Streamlit to allow nested expanders

from src.crawler.CrawlerV2 import Crawler
from src.crawler.crawler_service import *
from src.crawler.utils.maps_types import MAPS_TYPES
from src.crawler.maps_api import get_maps_results
from src.persistence.db import *


@st.cache_resource
def init_connection():
    return init_db()

def crawl(item):
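    """Crawl a single DB item and store found overview pages and sub-URLs on it.

    If the crawler raises, the item is removed from ``unsorted_urls``.
    """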
    results = []
    try:
        st.info(f"Crawle {item['url']}")

        # Only crawl if this item has no overview pages yet.
        if "overview_pages" not in item:
            crawler = Crawler(item["url"], item["url_type"], depth=2)
            results = crawler.crawl()
    except Exception as e:
        st.error(f"Fehler beim crawlen: {e}")
        db.unsorted_urls.delete_one({"_id": item["_id"]})
        return

    # Detect overview pages (event calendars, programme pages, etc.);
    # matches e.g. ".../veranstaltungen" or ".../termine", but not ".../adventskalender".
    overview_regex = re.compile(
        r"^https?:\/\/([a-zA-Z0-9.-]*\/)*(?!(advent))(kalender|.*veranstaltungen|veranstaltungskalender|.*events?|.*event-?kalender|([a-zA-Z]*)?programm|gottesdienste|auff(ü|ue)hrungen|termine|spielplan)(\/?|(\/?[a-zA-Z]*)\.[a-zA-Z]*)?$",
        re.IGNORECASE)
    overview_pages = set()

    # Sort the crawled URLs into overview pages and sub-URLs.
    sub_urls = []
    for url in results:
        if overview_regex.match(url):
            overview_pages.add(url)
        else:
            sub_urls.append(url)

    # Fallback: if no URL matched completely, retry without the end anchor
    # and keep only the matching prefix of longer URLs.
    if not overview_pages:
        overview_regex = re.compile(
            r"^https?:\/\/([a-zA-Z0-9.-]*\/)*(?!(advent))(kalender|.*veranstaltungen|veranstaltungskalender|.*events?|.*event-?kalender|([a-zA-Z]*)?programm|gottesdienste|auff(ü|ue)hrungen|termine|spielplan)(\/?|(\/?[a-zA-Z]*)\.[a-zA-Z]*)?",
            re.IGNORECASE)
        for url in results:
            match = overview_regex.search(url)
            if match:
                overview_pages.add(match.group())
    overview_pages = {url.casefold() for url in overview_pages}

    with st.expander("Gefundene Suburls"):
        for url in sub_urls:
            st.write(url)
    with st.expander("Gefundene Übersichtsseiten:"):
        for url in overview_pages:
            st.write(url)

    # Update the DB entry with the crawl results.
    new_values = {"$set": {"crawled": True}}
    if overview_pages:
        new_values["$set"]["overview_pages"] = list(overview_pages)
    if sub_urls:
        item["sub_urls"] = sub_urls
        new_values["$set"]["sub_urls"] = sub_urls

    db.unsorted_urls.update_one({"_id": item["_id"]}, new_values)
    print(db.unsorted_urls.find_one({"_id": item["_id"]}))

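# Database handle, cached across Streamlit reruns via st.cache_resource.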
db = init_connection()

# content
st.title("Event-Urls-Suche mit Crawler und Google API")
st.write("""
    Wähle aus, für wie viele Urls der **Crawler** gestartet werden soll. Diese werden aus den noch nicht gecrawlten Urls in der DB ausgewählt.
    Wenn **"Google Maps Ergebnisse finden"** aktiviert ist, werden bei den Stadtportalen zusätzlich noch neue Veranstaltungsorte gesucht.""")
with st.form("Crawler Settings"):
    count = int(st.number_input("Wie viele URLs sollen gecrawled werden?", min_value=1, step=1))
    maps = st.checkbox("Google Maps Ergebnisse finden")
    # Every form must have a submit button.
    submitted = st.form_submit_button("Starte Crawler")
    if submitted:
        for i in range(count):
            # Take the next not-yet-crawled URL from the DB.
            item = db.unsorted_urls.find_one({"crawled": None})
            if item is None:
                st.warning("Keine ungecrawlten Urls mehr in der DB")
                break
            with st.expander(f"Ergebnisse für {item['url']} in {item['meta']['location']}"):

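                # For city portals, optionally look up new venues via Google Maps
                # and store them as new entries in unsorted_urls.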
                if item["url_type"] == "city" and maps:
                    for type_id in random.sample(MAPS_TYPES, 5):
                        print(item)
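                        # Skip Maps types that were already searched for this item.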
                        if "maps_searches" not in item or "maps_searches" in item and type_id not in item["maps_searches"]:
                            st.info(f"Suche Maps Ergebnisse für {type_id} in {item['meta']['location']}")
                            try:
                                maps_results = get_maps_results(type_id, item["meta"]["location"])
                                if maps_results:
                                    new_elements = []
                                    with st.expander("Maps Ergebnisse"):
                                        for result in maps_results:
                                            if result.website_uri \
                                                    and "facebook" not in result.website_uri \
                                                    and "instagram" not in result.website_uri \
                                                    and "tiktok" not in result.website_uri \
                                                    and result.website_uri not in [e["url"] for e in new_elements]:
                                                element = {
                                                    "url_type": type_id,
                                                    "url": result.website_uri,
                                                    "meta":{
                                                        "website_host": result.display_name.text,
                                                        "location": result.formatted_address.split(", ")[1],
                                                        "address": result.formatted_address,
                                                        "maps_types": list(result.types)
                                                    }}
                                                st.write(f"{element['meta']['website_host']} - {element['url']}")
                                                new_elements.append(element)
                                        if new_elements:
                                            db.unsorted_urls.insert_many(new_elements)
                            except Exception as e:
                                st.error(e)

                            if "maps_searches" in item:
                                maps_searches = item["maps_searches"]
                                maps_searches.append(type_id)
                                item["maps_searches"] = maps_searches
                            else:
                                item["maps_searches"] =  [type_id]
                        else:
                            st.success("Maps Ergebnisse bereits in DB")

                crawl(item)