File size: 9,666 Bytes
da88570 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 |
import spacy
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from sklearn.metrics.pairwise import cosine_similarity
from src.nlp.experimental.topic_modeling_data import DATA
from src.nlp.playground.textsummarization import SumySummarizer
import webbrowser
stop_words = ["a","ab","aber","ach","acht","achte","achten","achter","achtes","ag","alle","allein","allem","allen","aller","allerdings","alles","allgemeinen","als","also","am","an","ander","andere","anderem","anderen","anderer","anderes","anderm","andern","anderr","anders","au","auch","auf","aus","ausser","ausserdem","außer","außerdem","b","bald","bei","beide","beiden","beim","beispiel","bekannt","bereits","besonders","besser","besten","bin","bis","bisher","bist","c","d","d.h","da","dabei","dadurch","dafür","dagegen","daher","dahin","dahinter","damals","damit","danach","daneben","dank","dann","daran","darauf","daraus","darf","darfst","darin","darum","darunter","darüber","das","dasein","daselbst","dass","dasselbe","davon","davor","dazu","dazwischen","daß","dein","deine","deinem","deinen","deiner","deines","dem","dementsprechend","demgegenüber","demgemäss","demgemäß","demselben","demzufolge","den","denen","denn","denselben","der","deren","derer","derjenige","derjenigen","dermassen","dermaßen","derselbe","derselben","des","deshalb","desselben","dessen","deswegen","dich","die","diejenige","diejenigen","dies","diese","dieselbe","dieselben","diesem","diesen","dieser","dieses","dir","doch","dort","drei","drin","dritte","dritten","dritter","drittes","du","durch","durchaus","durfte","durften","dürfen","dürft","e","eben","ebenso","ehrlich","ei","ei,","eigen","eigene","eigenen","eigener","eigenes","ein","einander","eine","einem","einen","einer","eines","einig","einige","einigem","einigen","einiger","einiges","einmal","eins","elf","en","ende","endlich","entweder","er","ernst","erst","erste","ersten","erster","erstes","es","etwa","etwas","euch","euer","eure","eurem","euren","eurer","eures","f","folgende","früher","fünf","fünfte","fünften","fünfter","fünftes","für","g","gab","ganz","ganze","ganzen","ganzer","ganzes","gar","gedurft","gegen","gegenüber","gehabt","gehen","geht","gekannt","gekonnt","gemacht","gemocht","gemusst","genug","gerade","gern","gesagt","geschweige","gewesen","gewollt","geworden","gibt","ging","gleich","gott","gross","grosse","grossen","grosser","grosses","groß","große","großen","großer","großes","gut","gute","guter","gutes","h","hab","habe","haben","habt","hast","hat","hatte","hatten","hattest","hattet","heisst","her","heute","hier","hin","hinter","hoch","hätte","hätten","i","ich","ihm","ihn","ihnen","ihr","ihre","ihrem","ihren","ihrer","ihres","im","immer","in","indem","infolgedessen","ins","irgend","ist","j","ja","jahr","jahre","jahren","je","jede","jedem","jeden","jeder","jedermann","jedermanns","jedes","jedoch","jemand","jemandem","jemanden","jene","jenem","jenen","jener","jenes","jetzt","k","kam","kann","kannst","kaum","kein","keine","keinem","keinen","keiner","keines","kleine","kleinen","kleiner","kleines","kommen","kommt","konnte","konnten","kurz","können","könnt","könnte","l","lang","lange","leicht","leide","lieber","los","m","machen","macht","machte","mag","magst","mahn","mal","man","manche","manchem","manchen","mancher","manches","mann","mehr","mein","meine","meinem","meinen","meiner","meines","mensch","menschen","mich","mir","mit","mittel","mochte","mochten","morgen","muss","musst","musste","mussten","muß","mußt","möchte","mögen","möglich","mögt","müssen","müsst","müßt","n","na","nach","nachdem","nahm","natürlich","neben","nein","neue","neuen","neun","neunte","neunten","neunter","neuntes","nicht","nichts","nie","niemand","niemandem","niemanden","noch","nun","nur","o","ob","oben","oder","offen","oft","ohne","ordnung","p","q","r","recht","rechte","rechten","rechter","rechtes","richtig","rund","s","sa","sache","sagt","sagte","sah","satt","schlecht","schluss","schon","sechs","sechste","sechsten","sechster","sechstes","sehr","sei","seid","seien","sein","seine","seinem","seinen","seiner","seines","seit","seitdem","selbst","sich","sie","sieben","siebente","siebenten","siebenter","siebentes","sind","so","solang","solche","solchem","solchen","solcher","solches","soll","sollen","sollst","sollt","sollte","sollten","sondern","sonst","soweit","sowie","später","startseite","statt","steht","suche","t","tag","tage","tagen","tat","teil","tel","tritt","trotzdem","tun","u","uhr","um","und","uns","unse","unsem","unsen","unser","unsere","unserer","unses","unter","v","vergangenen","viel","viele","vielem","vielen","vielleicht","vier","vierte","vierten","vierter","viertes","vom","von","vor","w","wahr","wann","war","waren","warst","wart","warum","was","weg","wegen","weil","weit","weiter","weitere","weiteren","weiteres","welche","welchem","welchen","welcher","welches","wem","wen","wenig","wenige","weniger","weniges","wenigstens","wenn","wer","werde","werden","werdet","weshalb","wessen","wie","wieder","wieso","will","willst","wir","wird","wirklich","wirst","wissen","wo","woher","wohin","wohl","wollen","wollt","wollte","wollten","worden","wurde","wurden","während","währenddem","währenddessen","wäre","würde","würden","x","y","z","z.b","zehn","zehnte","zehnten","zehnter","zehntes","zeit","zu","zuerst","zugleich","zum","zunächst","zur","zurück","zusammen","zwanzig","zwar","zwei","zweite","zweiten","zweiter","zweites","zwischen","zwölf","über","überhaupt","übrigens","wann", "wo", "datum", "kalender", "termin", "veranstaltungsort",
"eintritt", "uhr", "tickets", "datum", "termin", "termine", "veranstaltung","veranstaltungen"
"am", "um", "bis", "ab", "von", "mit", "mehr",
"Januar", "Februar", "März", "April", "Mai", "Juni",
"Juli", "August", "September", "Oktober", "November", "Dezember",
"Montag", "Dienstag", "Mittwoch", "Donnerstag", "Freitag", "Samstag", "Sonntag"
]
data = DATA
print(len(data))
summarizer = SumySummarizer()
data = [" ".join(summarizer.summarize(d)) for d in data]
# Preprocessing: Remove entities, and all tokens that include other characters that letters, except for "-"
nlp = spacy.load("de_core_news_sm")
# cleaned_docs = []
# for doc in data:
# doc_spacy = nlp(doc)
#
# cleaned_doc = " ".join([token.text for token in doc_spacy
# if token.ent_type_ == ""
# and len(token.text) > 2
# and (token.is_alpha or '-' in token.text)])
#
# cleaned_docs.append(cleaned_doc)
#
# for i, cleaned in enumerate(cleaned_docs):
# print(f"Bereinigtes Dokument {i+1}: {cleaned}")
# We select a subsample of 5000 abstracts from ArXiv
# docs = cleaned_docs
docs = data
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# matryoshka_dim = 512
# embedding_model = SentenceTransformer("aari1995/German_Semantic_V3", trust_remote_code=True, truncate_dim=matryoshka_dim)
embeddings = embedding_model.encode(docs,batch_size=256, show_progress_bar=True)
vectorizer_model = CountVectorizer(stop_words=stop_words, max_features=10000)
# We define a number of topics that we know are in the documents
zeroshot_topic_list = [
"Ausstellung",
"Charity-Event",
"Comedy",
"Dinner-Show",
"Dokumentation",
"Eröffnung",
"Familie",
"Feier",
"Filmfestival",
"Filmvorführung",
"Gaming",
"Gesprächsabend",
"Gottesdienst",
"Infoveranstaltung",
"Kabarett",
"Kinder",
"Kochkurs",
"Konferenz",
"Konzert",
"Kultur",
"Kunst",
"Lesung",
"Markt",
"Messe",
"Modenschau",
"Museum",
"Musical",
"Onlinekurs",
"Oper",
"Party",
"Performance",
"Religion",
"Seminar",
"Sport",
"Startup",
"Tanz",
"Tech",
"Theater",
"Vortrag",
"Webinar",
"Workshop"
]
# We fit our model using the zero-shot topics
# and we define a minimum similarity. For each document,
# if the similarity does not exceed that value, it will be used
# for clustering instead.
topic_model = BERTopic(
language="de",
embedding_model=embedding_model,
min_topic_size=5,
zeroshot_topic_list=zeroshot_topic_list,
zeroshot_min_similarity=.85,
representation_model=KeyBERTInspired(),
vectorizer_model=vectorizer_model,
verbose=True,
)
topic_model = topic_model.fit(docs)
topic_distr, _ = topic_model.approximate_distribution(docs)
fig = topic_model.visualize_distribution(topic_distr[1])
print(fig)
fig.write_html("plot.html")
webbrowser.open("plot.html")
# topics, _ = topic_model.fit_transform(docs,embeddings)
#
# fig = topic_model.visualize_topics()
# topic_info = topic_model.get_topic_info()
# topic_info.to_html("topic_info.html")
# fig.show()
#
#
#
# docs_and_topics = list(zip(docs, topics))
#
# # Sortieren nach topic
# docs_and_topics.sort(key=lambda x: x[1])
#
# # Durchlaufen der sortierten Liste und Ausgeben der Dokumente nach Topic
# current_topic = None
# for doc, topic in docs_and_topics:
# if topic != current_topic:
# # Neues Topic gefunden, Ausgabe des Themas
# current_topic = topic
# print(f"\nTopic: {topic} {topic_model.get_topic(topic)}")
# print(f"→ Dokument: {doc}")
# reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
# topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
|