# manaviel85370
# add pages and all
# da88570
import pandas as pd
import spacy
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
# German stop words handed to CountVectorizer so that function words do not
# end up in the topic representations. Entries are lowercase and kept in their
# original order. The entry "sondern" used to be split across a physical line
# break inside the string literal, which made the module unparsable; it is
# written as a single literal here.
stop_words = [
    "a", "ab", "aber", "ach", "acht", "achte", "achten", "achter", "achtes",
    "ag", "alle", "allein", "allem", "allen", "aller", "allerdings", "alles",
    "allgemeinen", "als", "also", "am", "an", "ander", "andere", "anderem",
    "anderen", "anderer", "anderes", "anderm", "andern", "anderr", "anders",
    "au", "auch", "auf", "aus", "ausser", "ausserdem", "außer", "außerdem",
    "b", "bald", "bei", "beide", "beiden", "beim", "beispiel", "bekannt",
    "bereits", "besonders", "besser", "besten", "bin", "bis", "bisher", "bist",
    "c",
    "d", "d.h", "da", "dabei", "dadurch", "dafür", "dagegen", "daher", "dahin",
    "dahinter", "damals", "damit", "danach", "daneben", "dank", "dann",
    "daran", "darauf", "daraus", "darf", "darfst", "darin", "darum",
    "darunter", "darüber", "das", "dasein", "daselbst", "dass", "dasselbe",
    "davon", "davor", "dazu", "dazwischen", "daß", "dein", "deine", "deinem",
    "deinen", "deiner", "deines", "dem", "dementsprechend", "demgegenüber",
    "demgemäss", "demgemäß", "demselben", "demzufolge", "den", "denen", "denn",
    "denselben", "der", "deren", "derer", "derjenige", "derjenigen",
    "dermassen", "dermaßen", "derselbe", "derselben", "des", "deshalb",
    "desselben", "dessen", "deswegen", "dich", "die", "diejenige",
    "diejenigen", "dies", "diese", "dieselbe", "dieselben", "diesem", "diesen",
    "dieser", "dieses", "dir", "doch", "dort", "drei", "drin", "dritte",
    "dritten", "dritter", "drittes", "du", "durch", "durchaus", "durfte",
    "durften", "dürfen", "dürft",
    "e", "eben", "ebenso", "ehrlich", "ei", "ei,", "eigen", "eigene",
    "eigenen", "eigener", "eigenes", "ein", "einander", "eine", "einem",
    "einen", "einer", "eines", "einig", "einige", "einigem", "einigen",
    "einiger", "einiges", "einmal", "eins", "elf", "en", "ende", "endlich",
    "entweder", "er", "ernst", "erst", "erste", "ersten", "erster", "erstes",
    "es", "etwa", "etwas", "euch", "euer", "eure", "eurem", "euren", "eurer",
    "eures",
    "f", "folgende", "früher", "fünf", "fünfte", "fünften", "fünfter",
    "fünftes", "für",
    "g", "gab", "ganz", "ganze", "ganzen", "ganzer", "ganzes", "gar",
    "gedurft", "gegen", "gegenüber", "gehabt", "gehen", "geht", "gekannt",
    "gekonnt", "gemacht", "gemocht", "gemusst", "genug", "gerade", "gern",
    "gesagt", "geschweige", "gewesen", "gewollt", "geworden", "gibt", "ging",
    "gleich", "gott", "gross", "grosse", "grossen", "grosser", "grosses",
    "groß", "große", "großen", "großer", "großes", "gut", "gute", "guter",
    "gutes",
    "h", "hab", "habe", "haben", "habt", "hast", "hat", "hatte", "hatten",
    "hattest", "hattet", "heisst", "her", "heute", "hier", "hin", "hinter",
    "hoch", "hätte", "hätten",
    "i", "ich", "ihm", "ihn", "ihnen", "ihr", "ihre", "ihrem", "ihren",
    "ihrer", "ihres", "im", "immer", "in", "indem", "infolgedessen", "ins",
    "irgend", "ist",
    "j", "ja", "jahr", "jahre", "jahren", "je", "jede", "jedem", "jeden",
    "jeder", "jedermann", "jedermanns", "jedes", "jedoch", "jemand",
    "jemandem", "jemanden", "jene", "jenem", "jenen", "jener", "jenes",
    "jetzt",
    "k", "kam", "kann", "kannst", "kaum", "kein", "keine", "keinem", "keinen",
    "keiner", "keines", "kleine", "kleinen", "kleiner", "kleines", "kommen",
    "kommt", "konnte", "konnten", "kurz", "können", "könnt", "könnte",
    "l", "lang", "lange", "leicht", "leide", "lieber", "los",
    "m", "machen", "macht", "machte", "mag", "magst", "mahn", "mal", "man",
    "manche", "manchem", "manchen", "mancher", "manches", "mann", "mehr",
    "mein", "meine", "meinem", "meinen", "meiner", "meines", "mensch",
    "menschen", "mich", "mir", "mit", "mittel", "mochte", "mochten", "morgen",
    "muss", "musst", "musste", "mussten", "muß", "mußt", "möchte", "mögen",
    "möglich", "mögt", "müssen", "müsst", "müßt",
    "n", "na", "nach", "nachdem", "nahm", "natürlich", "neben", "nein", "neue",
    "neuen", "neun", "neunte", "neunten", "neunter", "neuntes", "nicht",
    "nichts", "nie", "niemand", "niemandem", "niemanden", "noch", "nun", "nur",
    "o", "ob", "oben", "oder", "offen", "oft", "ohne", "ordnung",
    "p",
    "q",
    "r", "recht", "rechte", "rechten", "rechter", "rechtes", "richtig", "rund",
    "s", "sa", "sache", "sagt", "sagte", "sah", "satt", "schlecht", "schluss",
    "schon", "sechs", "sechste", "sechsten", "sechster", "sechstes", "sehr",
    "sei", "seid", "seien", "sein", "seine", "seinem", "seinen", "seiner",
    "seines", "seit", "seitdem", "selbst", "sich", "sie", "sieben", "siebente",
    "siebenten", "siebenter", "siebentes", "sind", "so", "solang", "solche",
    "solchem", "solchen", "solcher", "solches", "soll", "sollen", "sollst",
    "sollt", "sollte", "sollten", "sondern", "sonst", "soweit", "sowie",
    "später", "startseite", "statt", "steht", "suche",
    "t", "tag", "tage", "tagen", "tat", "teil", "tel", "tritt", "trotzdem",
    "tun",
    "u", "uhr", "um", "und", "uns", "unse", "unsem", "unsen", "unser",
    "unsere", "unserer", "unses", "unter",
    "v", "vergangenen", "viel", "viele", "vielem", "vielen", "vielleicht",
    "vier", "vierte", "vierten", "vierter", "viertes", "vom", "von", "vor",
    "w", "wahr", "wann", "war", "waren", "warst", "wart", "warum", "was",
    "weg", "wegen", "weil", "weit", "weiter", "weitere", "weiteren",
    "weiteres", "welche", "welchem", "welchen", "welcher", "welches", "wem",
    "wen", "wenig", "wenige", "weniger", "weniges", "wenigstens", "wenn",
    "wer", "werde", "werden", "werdet", "weshalb", "wessen", "wie", "wieder",
    "wieso", "will", "willst", "wir", "wird", "wirklich", "wirst", "wissen",
    "wo", "woher", "wohin", "wohl", "wollen", "wollt", "wollte", "wollten",
    "worden", "wurde", "wurden", "während", "währenddem", "währenddessen",
    "wäre", "würde", "würden",
    "x",
    "y",
    "z", "z.b", "zehn", "zehnte", "zehnten", "zehnter", "zehntes", "zeit",
    "zu", "zuerst", "zugleich", "zum", "zunächst", "zur", "zurück", "zusammen",
    "zwanzig", "zwar", "zwei", "zweite", "zweiten", "zweiter", "zweites",
    "zwischen", "zwölf",
    "über", "überhaupt", "übrigens",
]
# Six hand-written German event announcements (concert, yoga workshop,
# children's festival, author reading, flea market, film evening). Each entry
# is one multi-line string holding a title, a "Termine anzeigen" line, a short
# description and Datum/Uhrzeit/Ort details. The list is multiplied further
# down to form the documents the topic model is fitted on.
# NOTE(review): the "[email protected]" text in the flea-market entry looks
# like an e-mail obfuscation placeholder rather than intended sample text --
# confirm against the original data.
data = [
"""
Konzertabend
Termine anzeigen
Lassen Sie sich von unserer Musikschule mit einem wunderbaren Konzertabend begeistern! Unsere talentierten Schüler und Schülerinnen spielen Stücke von Bach, Beethoven und Mozart. Eintritt frei.
Datum: Samstag, 27. Januar 2025
Uhrzeit: 19:00 Uhr
Ort: Stadthalle, Hauptstraße 12, 12345 Musterstadt
""",
"""
Yoga-Workshop
Termine anzeigen
Finden Sie Ihre innere Balance in unserem wöchentlichen Yoga-Workshop! Geeignet für Anfänger und Fortgeschrittene. Bitte bequeme Kleidung und eine Yoga-Matte mitbringen.
Datum: Jeden Mittwoch
Uhrzeit: 18:30 - 20:00 Uhr
Ort: Gemeinschaftshaus, Raum 3, Parkweg 5, 67890 Beispielstadt
Kosten: 10€ pro Sitzung
""",
"""
Kinderfest
Termine anzeigen
Ein Tag voller Spaß und Abenteuer! Unser Kinderfest bietet Spielstationen, eine Hüpfburg, Bastelaktionen und ein Kinderschminken. Für das leibliche Wohl ist mit Getränken und Snacks gesorgt.
Datum: Sonntag, 5. Mai 2025
Uhrzeit: 10:00 - 16:00 Uhr
Ort: Sportplatz, Schulstraße 20, 98765 Spaßhausen
""",
"""
Autorenlesung
Termine anzeigen
Die Bestsellerautorin Anna Mustermann liest aus ihrem neuen Buch "Geschichten aus dem Alltag". Erleben Sie einen literarischen Abend voller Inspiration und spannender Geschichten.
Datum: Freitag, 14. Februar 2025
Uhrzeit: 19:30 Uhr
Ort: Stadtbibliothek, Lesesaal, Bücherweg 7, 54321 Bücherstadt
Eintritt: 5€
""",
"""
Flohmarkt
Termine anzeigen
Stöbern, feilschen und entdecken! Unser Flohmarkt bietet eine Vielzahl an Ständen mit Kleidung, Büchern, Spielzeug und vielem mehr. Standanmeldung unter: [email protected].
Datum: Samstag, 3. Juni 2025
Uhrzeit: 08:00 - 14:00 Uhr
Ort: Marktplatz, Innenstadt, 11223 Schnäppchenhausen
""",
"""
Filmabend
Termine anzeigen
Genießen Sie einen gemütlichen Filmabend mit Popcorn und Getränken. Gezeigt wird der Film "Der geheime Garten". Für Kinder und Erwachsene geeignet.
Datum: Freitag, 10. März 2025
Uhrzeit: 20:00 Uhr
Ort: Gemeindezentrum, Kinoraum, Hauptstraße 5, 44567 Filmstadt
Eintritt: Spende erbeten
"""
]
# Build the training corpus by repeating every sample document 170 times.
# NOTE(review): presumably done so the handful of samples is large enough for
# the topic model to fit -- confirm.
docs = data * 170
print(len(docs))

# German spaCy pipeline handed to BERTopic as its embedding model; the listed
# pipeline components are excluded when loading.
nlp = spacy.load(
    "de_core_news_lg",
    exclude=["tagger", "parser", "ner", "attribute_ruler", "lemmatizer"],
)

# Vectorizer that filters the German stop words defined earlier in this file.
vectorizer_model = CountVectorizer(stop_words=stop_words)

topic_model = BERTopic(
    embedding_model=nlp,
    vectorizer_model=vectorizer_model,
)
topics, probs = topic_model.fit_transform(docs)

# Persist the fitted model (including its c-TF-IDF data) in safetensors format.
topic_model.save(
    "models/bertopic_model",
    serialization="safetensors",
    save_ctfidf=True,
)

# Display the topic visualization figure produced by the fitted model.
fig = topic_model.visualize_topics()
fig.show()