from classy_classification import ClassyClassifier
import pickle
import spacy
from spacy import displacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
from nltk import Tree
from src.nlp.data.test_texts import TEXTS
from src.nlp.playground.textclassification import ZeroShotClassifier, CustomMode
from src.utils.helpers import normalize_data
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer

# Hand-written example phrases (German) for the two time categories the
# classifier distinguishes. Keys are the class labels; values are the raw
# example strings, which are normalized and placeholder-masked further below.
train_data = {
    # Phrases that state when the event itself takes place.
    "EVENT_TIME": [
        "Die Veranstaltung beginnt am 24.12.2025 16:00",
        "Wann: 03.03.2020 16:00",
        "25.04.2025 - 23.05.2020 , 15:00 - 16:00",
        "22.06.2022 18:00",
        "23.12.2030 17:00 - 18:00",
        "12.05.2024 , 19:00 - 21:00",
        "So. 12.08.2024 12:15 - 13:15",
        "**24.12.2025 16:00 **",
        "15.01.2025 18:00",
        "18.01.2025 11:00 - 18:00",
        "**Der Nikolaus kommt am Freitag 17:00 !**",
        "Sa 12.00 + 16:00",
        "So 12:00",
        "Kindertheater Sa + So 15:00",
        "| Beginn | Freitag 16:00 - 20:00",
        "/ Samstag 11:00 - 20:00",
        "/ Sonntag 11:00 - 18:00",
        "19:00",
        "Beginn: 15:00",
    ],
    # Phrases that state when doors open / admission starts.
    "ADMITTANCE_TIME": [
        "Einlass ist 15:00",
        "Einlass ist 15:30",
        "Einlass: 15:00",
        "Abendkasse 14:30",
        "Einlass: 14:30",
        "Tageskasse 15:00 im Museum",
        "Einlass beginnt um 16:00",
        "Der Einlass erfolgt um 17:30",
        "Zutritt ab 18:00",
        "Gäste dürfen ab 14:45 eintreten",
        "Türöffnung: 19:00",
        "Einlass erfolgt ab 20:15",
        "Einlass für VIPs: 13:30",
        "Normaler Einlass: 16:45",
        "Die Türen öffnen sich um 12:00",
        "Zugang ab 17:00 möglich",
        "Einlasskontrolle beginnt um 18:30",
        "Einlass erst ab 19:15 gestattet",
    ],
}


# Blank German pipeline: only the components added below (sentencizer and
# entity ruler) run on a text.
nlp = spacy.blank("de")
nlp.add_pipe('sentencizer')

# 1) Tokenizer rules.
#    - Prefixes: the language defaults. They must be passed explicitly,
#      because a Tokenizer built without `prefix_search` never splits leading
#      characters off a token. Leading punctuation glued to a number (e.g.
#      markdown "**24") would then keep the token's shape from being "dd"
#      and the DATE/TIME patterns below could not match.
#    - Suffixes: the language defaults plus a plain "." so a trailing dot is
#      always split off.
#    - Infixes: the language defaults plus "." and ":" between two digits, so
#      "24.12.2025" and "16:00" are split into digit groups and separators.
prefixes = list(nlp.Defaults.prefixes)
suffixes = list(nlp.Defaults.suffixes) + [r"\."]
infixes = list(nlp.Defaults.infixes) + [r"(?<=\d)\.(?=\d)", r"(?<=\d)\:(?=\d)"]

# Compile the rule lists into regex objects.
prefix_re = compile_prefix_regex(prefixes)
suffix_re = compile_suffix_regex(suffixes)
infix_re = compile_infix_regex(infixes)

# Install the customised tokenizer on the pipeline.
nlp.tokenizer = Tokenizer(
    nlp.vocab,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
)

# 2) Entity ruler that tags dates and times based on token shapes.
ruler = nlp.add_pipe("entity_ruler")

patterns = [
    {
        # dd.dd.dddd, e.g. "24.12.2025" (five tokens after the infix split)
        "label": "DATE",
        "pattern": [
            {"SHAPE": "dd"}, {"ORTH": "."}, {"SHAPE": "dd"}, {"ORTH": "."}, {"SHAPE": "dddd"}
        ]
    },
    {
        # dd:dd, e.g. "16:00" (three tokens after the infix split)
        "label": "TIME",
        "pattern": [
            {"SHAPE": "dd"}, {"ORTH": ":"}, {"SHAPE": "dd"}
        ]
    }
]

ruler.add_patterns(patterns)


# Prepare training data: replace every detected date/time with a placeholder
# so the examples differ only in their surrounding wording, not in the
# concrete values.
classifier_train_data_cleaned = {"EVENT_TIME": [], "ADMITTANCE_TIME": []}
for label, cleaned_examples in classifier_train_data_cleaned.items():
    for text in train_data[label]:
        text = normalize_data(text)
        doc = nlp(text)
        for ent in doc.ents:
            # str.replace substitutes every occurrence of the entity text,
            # so repeated identical values are masked in one go.
            if ent.label_ == "DATE":
                text = text.replace(ent.text, "[DATE]")
            elif ent.label_ == "TIME":
                text = text.replace(ent.text, "[TIME]")
        cleaned_examples.append(text)

# Remove duplicates produced by the masking. dict.fromkeys keeps the
# first-seen order, so the training input is identical on every run
# (a set would yield an arbitrary, run-dependent order).
classifier_train_data_cleaned = {
    label: list(dict.fromkeys(cleaned_examples))
    for label, cleaned_examples in classifier_train_data_cleaned.items()
}


# Build the classifier from the placeholder-masked examples, then select the
# embedding model by name.
classifier = ClassyClassifier(data=classifier_train_data_cleaned)
classifier.set_embedding_model(model="stsb-xlm-r-multilingual")

# Persist the classifier object. The path is relative, so it is resolved
# against the current working directory of the process.
with open("../../playground/models/time_classifier.pkl", mode="wb") as model_file:
    pickle.dump(classifier, model_file)

# Ad-hoc sample inputs for a quick manual check of the classifier.
# NOTE: this rebinds TEXTS and thereby replaces the TEXTS imported from
# src.nlp.data.test_texts at the top of the file.
TEXTS = ["Einlass: ab 15:00", "Beginn: 18:00", "Konzert: 20:00 bis 21:00", "Start: 20:00, Ende: 21:00"]

for text in TEXTS:
    print(text)
    text = normalize_data(text)
    analyzer = MarkdownAnalyzer(text)

    # Fall back to an empty list when no "block_elements" entry is reported,
    # so the loop below never iterates over None.
    md_elements = analyzer.identify_all().get("block_elements") or []
    for md_element in md_elements:
        doc = nlp(md_element.text)
        # Debug aid: print([token.text for token in doc]) to inspect the
        # tokenization.
        if not doc.ents:
            # Nothing to classify without a detected date or time.
            continue

        print("Found Entities: ", doc.ents)

        # Mask TIME and DATE entities with the same placeholders that were
        # used for the training examples.
        modified_text = md_element.text
        for ent in doc.ents:
            if ent.label_ == "DATE":
                modified_text = modified_text.replace(ent.text, "[DATE]")
            elif ent.label_ == "TIME":
                modified_text = modified_text.replace(ent.text, "[TIME]")

        # The classifier result is used as a mapping; report the key with the
        # highest value.
        cats = classifier(modified_text)
        time_category = max(cats, key=cats.get)
        print(time_category)
        print("*"*100)