# -*- coding: utf-8 -*-
# --- Prerequisites ---
# Ensure these are in your requirements.txt for Hugging Face Spaces:
#   spacy==3.5.0           # Or the version used to train the NER model
#   streamlit>=1.18.0      # st.cache_resource requires Streamlit 1.18+
#   transformers>=4.20.0
#   torch>=1.10.0          # Or tensorflow
#   sentencepiece>=0.1.90
#   protobuf==3.20.3
#   peft>=0.5.0            # Parameter-Efficient Fine-Tuning library
#   accelerate>=0.26.0
#   numpy
#   nltk                   # For ROUGE metric calculation during fine-tuning (needed for postprocess_text if kept)
#   bitsandbytes           # If using an 8-bit optimizer

import streamlit as st
import spacy
from pathlib import Path
import torch
import warnings

try:
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    from peft import PeftModel
    import nltk
    nltk.download('punkt', quiet=True)  # Ensure the punkt tokenizer is available for potential NLTK use
    print("✔ Successfully imported core libraries.")
except ImportError as e:
    # Surface the error in the Streamlit app itself if imports fail at runtime
    st.error(f"Error importing libraries: {e}. Please check requirements.txt and ensure all packages are installed.")
    st.stop()  # Stop execution if libraries are missing

# --- Configuration ---
# Use paths relative to this app.py script
NER_MODEL_PATH = Path("./training_400")  # Assumes the model-best folder is at the repo root
BASE_SUMMARIZATION_MODEL = "csebuetnlp/mT5_multilingual_XLSum"
ADAPTER_PATH = Path("./mt5_finetuned_tamil_summary")  # Path to your fine-tuned adapters

# Device selection
DEVICE = "cpu"  # Default to CPU for broader compatibility on free tiers
if torch.cuda.is_available():
    print("INFO: CUDA device detected. Setting DEVICE to 'cuda'.")
    DEVICE = "cuda"
else:
    print("INFO: No CUDA device detected. Using CPU.")

# Summarization parameters
SUMM_NUM_BEAMS = 4
MIN_LEN_PERC = 0.30   # Summary must be at least 30% of the input token count
MAX_LEN_PERC = 0.70   # ...and at most 70% of it
ABS_MIN_TOKEN_LEN = 30
ABS_MAX_TOKEN_LEN = 512
# --- End Configuration ---
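# Worked example of how the length parameters above combine (illustrative
# arithmetic only; the actual computation lives in summarize_text_internal):
#   400-token input -> min = max(30, int(400 * 0.30)) = 120 target tokens
#                      max = min(512, max(120 + 10, int(400 * 0.70))) = 280 target tokens
#   50-token input  -> min = max(30, int(50 * 0.30)) = 30 target tokens
#                      max = min(512, max(30 + 10, int(50 * 0.70))) = 40 target tokens
# The absolute bounds keep very short or very long inputs from producing
# degenerate generation limits.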
# --- Suppress Warnings ---
warnings.filterwarnings("ignore", message="CUDA path could not be detected*")
warnings.filterwarnings("ignore", message=".*You are using `torch.load` with `weights_only=False`.*")
warnings.filterwarnings("ignore", message=".*The sentencepiece tokenizer that you are converting.*")

# --- Global Variables & Model Loading Control ---
ner_model_global = None
summ_tokenizer_global = None
summ_model_global = None
models_loaded_status = "Not Loaded"  # More descriptive status


# --- Model Loading with Streamlit Caching ---
@st.cache_resource  # Cached once per process and shared across sessions
def load_ner_model_cached(path):
    """Loads the spaCy NER model."""
    global models_loaded_status
    models_loaded_status = f"Loading NER model from: {path}..."
    st.info(models_loaded_status)
    if not path.exists():
        st.error(f"NER model directory not found at {path.resolve()}")
        models_loaded_status = "Error: NER Model Not Found"
        return None
    try:
        nlp = spacy.load(path)
        # Add a sentencizer if needed (crucial for sentence splitting later)
        if not nlp.has_pipe("sentencizer") and not nlp.has_pipe("parser"):
            component_to_add_before = (
                "ner" if "ner" in nlp.pipe_names
                else "tok2vec" if "tok2vec" in nlp.pipe_names
                else None
            )
            if component_to_add_before:
                nlp.add_pipe("sentencizer", before=component_to_add_before)
            else:
                nlp.add_pipe("sentencizer", first=True)
            print("INFO: Added 'sentencizer' to NER pipeline.")
        print(f"✔ NER model loaded from: {path}")
        return nlp
    except Exception as e:
        st.error(f"Error loading NER model: {e}")
        models_loaded_status = f"Error Loading NER Model: {e}"
        return None


@st.cache_resource  # Cached once per process and shared across sessions
def load_summarizer_cached(base_model_name, adapter_path, device):
    """Loads the Hugging Face base model and applies the PEFT adapter."""
    global models_loaded_status
    models_loaded_status = f"Loading Summarizer (Base: {base_model_name}, Adapter: {adapter_path})..."
    st.info(models_loaded_status)
    try:
        print(f"┣ Loading base tokenizer: {base_model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        print(f"┣ Loading base model: {base_model_name}...")
        base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name)
        print(f"┣ Loading PEFT adapter from: {adapter_path}...")
        if not adapter_path.exists():
            st.error(f"✘ FATAL: PEFT adapter directory not found at {adapter_path.resolve()}. Using BASE model only.")
            model = base_model  # Fall back to the base model
        else:
            model = PeftModel.from_pretrained(base_model, adapter_path)
            print("✔ Successfully loaded PEFT adapter.")
        print(f"┣ Moving summarization model to {device}...")
        model.to(device)
        model.eval()  # Set to evaluation mode
        print(f"✔ Summarization model loaded on {device}.")
        return tokenizer, model
    except Exception as e:
        st.error(f"Error loading summarization model: {e}")
        print(f"✘ FATAL: Error loading summarization model: {e}")
        import traceback
        traceback.print_exc()
        models_loaded_status = f"Error Loading Summarizer: {e}"
        return None, None
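# Optional optimization (a sketch, assuming the adapter is a LoRA adapter):
# merging the adapter weights into the base model removes the PEFT indirection
# at inference time, at the cost of no longer being able to swap adapters.
# Something like the following could go in load_summarizer_cached after the
# adapter is applied:
#
#   if isinstance(model, PeftModel):
#       model = model.merge_and_unload()  # folds LoRA deltas into base weights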
# --- Helper Functions ---
def summarize_text_internal(tokenizer, model, text, device,
                            num_beams=SUMM_NUM_BEAMS,
                            min_length_perc=MIN_LEN_PERC,
                            max_length_perc=MAX_LEN_PERC):
    """Internal function to generate a summary."""
    if not text or text.isspace():
        return "[Error: Input text is empty]"
    # Ensure the models are loaded before proceeding
    if not tokenizer or not model:
        return "[Error: Summarization model not ready]"
    print("INFO: Generating summary (percentage lengths)...")
    try:
        # Count input tokens to derive the target summary length
        input_ids = tokenizer(text, return_tensors="pt", truncation=False, padding=False).input_ids
        input_token_count = input_ids.shape[1]
        if input_token_count == 0:
            return "[Error: Input tokenized to zero tokens]"
        min_len_tokens = max(ABS_MIN_TOKEN_LEN, int(input_token_count * min_length_perc))
        max_len_tokens = max(min_len_tokens + 10, int(input_token_count * max_length_perc))
        max_len_tokens = min(ABS_MAX_TOKEN_LEN, max_len_tokens)
        min_len_tokens = min(min_len_tokens, max_len_tokens)
        print(f"INFO: Target summary tokens: min={min_len_tokens}, max={max_len_tokens}")

        # Tokenize the model input (truncated to the model's context window)
        inputs = tokenizer(text, max_length=1024, return_tensors="pt", truncation=True).to(device)

        # Generate
        with torch.no_grad():
            summary_ids = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],  # mask any padding explicitly
                num_beams=num_beams,
                max_length=max_len_tokens,
                min_length=min_len_tokens,
                early_stopping=True,
            )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
        print("✔ Summary generation complete.")
        return summary
    except Exception as e:
        st.error(f"Error during summary generation: {e}")
        print(f"✘ Error during summary generation: {e}")
        import traceback
        traceback.print_exc()
        return f"[Error generating summary: {e}]"


def extract_entities_internal(ner_nlp, text):
    """Extracts entities and formats them as a Markdown string."""
    if not text or text.isspace():
        return [], "- No input text -"
    if ner_nlp is None:
        return [], "[Error: NER model not loaded]"
    print("INFO: Extracting entities...")
    try:
        doc = ner_nlp(text)
        # Deduplicate (text, label) pairs via a set comprehension
        entities = list({(ent.text.strip(), ent.label_) for ent in doc.ents if ent.text.strip()})
        print(f"✔ Extracted {len(entities)} unique entities.")
        if entities:
            # Format as a Markdown list, sorted by label
            entity_list_str = "\n".join(f"- **{lbl}:** {txt}" for txt, lbl in sorted(entities, key=lambda x: x[1]))
        else:
            entity_list_str = "(No entities found by NER model)"
        return entities, entity_list_str
    except Exception as e:
        st.error(f"Error during entity extraction: {e}")
        print(f"✘ Error during entity extraction: {e}")
        return [], "[Error extracting entities]"


def create_prompted_input_internal(text, entities):
    """Creates the input string with unique entities prepended."""
    if not entities:
        return text
    if not isinstance(text, str):
        return "[Invalid Input Text]"
    unique_entity_texts = sorted({ent[0] for ent in entities if ent[0]})
    entity_string = ", ".join(unique_entity_texts)
    separator = ". முக்கிய சொற்கள்: "  # "Key words:" in Tamil
    prompted_text = f"{entity_string}{separator}{text}"
    print(f"INFO: Created prompted input with {len(unique_entity_texts)} unique entities.")
    return prompted_text
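# Illustrative example of the prompted-input format (hypothetical entities,
# not output from a real run):
#
#   ents = [("சென்னை", "LOC"), ("ஸ்டாலின்", "PER")]
#   create_prompted_input_internal("மழை காரணமாக...", ents)
#   # -> "சென்னை, ஸ்டாலின். முக்கிய சொற்கள்: மழை காரணமாக..."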
# --- Streamlit App Layout ---
st.set_page_config(layout="wide", page_title="Tamil NER Summarizer", page_icon="✍️")
st.title("தமிழ் செய்தி சுருக்கம் மற்றும் NER ஒருங்கிணைப்பு")
st.markdown("*(Tamil News Summarization with NER Integration)*")
st.markdown("---")

# --- Load Models ---
# Trigger model loading via the cached functions and keep the results in globals
ner_model_global = load_ner_model_cached(NER_MODEL_PATH)
summ_tokenizer_global, summ_model_global = load_summarizer_cached(BASE_SUMMARIZATION_MODEL, ADAPTER_PATH, DEVICE)

# Check that the models loaded successfully before proceeding
models_ready = (
    ner_model_global is not None
    and summ_tokenizer_global is not None
    and summ_model_global is not None
)

if not models_ready:
    st.error("One or more essential models failed to load. Please check the application logs (terminal/HF Spaces logs) for details. The app cannot function.")
    st.stop()  # Stop the app if the models aren't ready
else:
    st.sidebar.success(f"Models loaded successfully on {DEVICE.upper()}!")
    st.sidebar.markdown(f"**NER Model:** `{NER_MODEL_PATH.name}`")
    st.sidebar.markdown(f"**Summarizer:** `{BASE_SUMMARIZATION_MODEL}` + Adapter")

# --- Input Area ---
st.header("உள்ளீடு / Input")
input_text = st.text_area("உங்கள் தமிழ் உரையை இங்கே ஒட்டவும் (Paste your Tamil text here):", height=300, key="input_text_area")

# --- Processing Trigger ---
if st.button("சுருக்கம் & NER ஐ உருவாக்குக (Generate Summary & NER)", key="generate_button"):
    if input_text and not input_text.isspace():
        text_to_process = input_text.strip()
        st.markdown("---")
        st.header("முடிவுகள் / Results")
        # Use columns for the final output
        col1, col2 = st.columns(2)

        # --- Column 1: NER entities ---
        with col1:
            st.subheader("முக்கிய சொற்கள் (NER Entities)")
            with st.spinner("Extracting entities..."):
                extracted_entities_raw, entities_display_string = extract_entities_internal(ner_model_global, text_to_process)
            # Display entities as Markdown so they can be copied
            st.markdown(entities_display_string)

        # --- Column 2: NER-influenced summary ---
        with col2:
            st.subheader("NER-உடன் செல்வாக்கு பெற்ற சுருக்கம்")
            st.markdown("*(NER-Influenced Summary)*")
            with st.spinner(f"Generating summary on {DEVICE}... (This may take time)"):
                # Prepend the extracted entities to the summarizer input
                prompted_input_text = create_prompted_input_internal(text_to_process, extracted_entities_raw)
                # Generate the summary
                ner_influenced_summary = summarize_text_internal(
                    summ_tokenizer_global, summ_model_global, prompted_input_text, DEVICE
                )
            # Display the summary as Markdown so it can be copied
            st.markdown(ner_influenced_summary)
            st.caption("Summary generated using the fine-tuned model with NER entities prepended to the input.")

        st.success("Processing complete!")
    else:
        # Covers None, the empty string, and whitespace-only input
        st.warning("Please enter some text into the input area.")
    # Handle the case where the button hasn't been pressed yet explicitly:
    # else:
    #     st.info("Click the button to generate summaries and extract entities.")

st.markdown("---")
st.caption("Developed using Streamlit, spaCy, and Hugging Face Transformers/PEFT.")
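# --- Running locally ---
# Standard Streamlit invocation (assumes this file is saved as app.py, as the
# relative-path comments above expect):
#
#   streamlit run app.py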