import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import torch
import numpy as np
from collections import Counter
import os
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pkg_resources
import folium
from folium.plugins import HeatMap
import country_converter as coco
from streamlit_folium import folium_static

current_dir = os.path.dirname(os.path.abspath(__file__))
font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")

ARABIC_STOP_WORDS = {
    'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم', 'أو', 'و', 'ف', 'ل', 'ب',
    'ك', 'لل', 'ال', 'هذا', 'هذه', 'ذلك', 'تلك', 'هؤلاء', 'هم', 'هن', 'هو', 'هي', 'نحن', 'انت',
    'انتم', 'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل', 'بعض', 'غير', 'حول', 'عند', 'قد', 'لقد',
    'لم', 'لن', 'لو', 'ما', 'ماذا', 'متى', 'كيف', 'اين', 'لماذا', 'الذي', 'التي', 'الذين',
    'اللاتي', 'اللواتي', 'الان', 'بين', 'فوق', 'تحت', 'امام', 'خلف', 'حين', 'قبل', 'بعد', 'أن',
    'له', 'قوة', 'كما', 'لها', 'منذ', 'وقد', 'ولا', 'نفس', 'ولم', 'حيث', 'هناك', 'جدا', 'ذات',
    'ضمن', 'انه', 'لدى', 'عليه', 'مثل', 'وله', 'أما', 'وأن', 'وكل', 'وقال', 'لدي', 'وكان', 'فيه',
    'وهي', 'وهو', 'كلم', 'لكن', 'وفي', 'وقف', 'ولقد', 'ومن', 'وهذا', 'اول', 'انها', 'جميع',
    'ايضا', 'لازم', 'حاجة', 'علي', 'يجب', 'صار', 'صارت', 'ضد'
}

st.set_page_config(
    page_title="Arabic Poem Analysis",
    page_icon="📚",
    layout="wide"
)


@st.cache_resource
def load_models():
    """Load and cache the models."""
    # use_fast=True enables the faster Rust tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment",
        use_fast=True
    )
    # torchscript and low_cpu_mem_usage reduce memory pressure on CPU-only hosts
    bert_model = AutoModel.from_pretrained(
        "aubmindlab/bert-base-arabertv2",
        torchscript=True,
        low_cpu_mem_usage=True
    )
    emotion_model = AutoModelForSequenceClassification.from_pretrained(
        "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment",
        torchscript=True,
        low_cpu_mem_usage=True
    )
    # Batched pipeline, forced onto CPU (device=-1)
    emotion_classifier = pipeline(
        "sentiment-analysis",
        model=emotion_model,
        tokenizer=tokenizer,
        batch_size=32,
        device=-1
    )
    return tokenizer, bert_model, emotion_classifier


def process_texts_in_batches(texts, batch_size=32):
    """Process texts in batches for better CPU utilization."""
    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
    results = []
    for batch in batches:
        # Relies on the module-level emotion_classifier created below
        batch_results = emotion_classifier(batch, truncation=True, max_length=512)
        results.extend(batch_results)
    return results


@st.cache_data
def get_cached_embeddings(text, _tokenizer, _model):
    """Cache embeddings to avoid recomputation.

    The leading underscores tell Streamlit not to try to hash the tokenizer
    and model objects, which are not hashable.
    """
    return get_embedding_for_text(text, _tokenizer, _model)


def create_theme_map(summaries, topic_model):
    """Create an interactive map showing theme distributions across countries."""
    try:
        # Base map centered on the Arab world
        m = folium.Map(location=[25, 45], zoom_start=4)

        # Convert country names to codes/coordinates
        cc = coco.CountryConverter()

        for summary in summaries:
            try:
                country_iso = cc.convert(names=[summary['country']], to='ISO2')
                country_data = cc.convert(names=[summary['country']], to='name_short')

                # Popup content (rendered as HTML by folium) with the top themes
                popup_content = f"""
                <b>{summary['country']}</b><br>
                Top Themes:<br>
                {'<br>'.join([f"• {topic['topic']}: {topic['count']}" for topic in summary['top_topics'][:5]])}
                """

                # Add a marker for each country; any lookup failure is caught below
                folium.CircleMarker(
                    location=[cc.convert(country_iso, to='latitude')[0],
                              cc.convert(country_iso, to='longitude')[0]],
                    radius=20,
                    popup=folium.Popup(popup_content, max_width=300),
                    color='red',
                    fill=True,
                    fill_opacity=0.7
                ).add_to(m)
            except Exception as e:
                st.warning(f"Could not process {summary['country']}: {str(e)}")
                continue

        return m
    except Exception as e:
        st.error(f"Error creating map: {str(e)}")
        return None


def split_text(text, max_length=512):
    """Split text into chunks of at most max_length words, preserving word boundaries."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        # Each word counts as one unit (a rough proxy for token length)
        word_length = len(word.split())
        if current_length + word_length > max_length:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = word_length
        else:
            current_chunk.append(word)
            current_length += word_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks


def create_arabic_wordcloud(text, title):
    """Build a word cloud figure for Arabic text using the bundled font."""
    wordcloud = WordCloud(
        width=1200,
        height=600,
        background_color='white',
        font_path=font_path,
        max_words=200,
        stopwords=ARABIC_STOP_WORDS
    ).generate(text)

    fig, ax = plt.subplots(figsize=(15, 8))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title, fontsize=16, pad=20)
    return fig


def clean_arabic_text(text):
    """Clean Arabic text by removing stop words and single-character tokens."""
    words = text.split()
    cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
    return ' '.join(cleaned_words)


def classify_emotion(text, classifier):
    """Classify emotion for a complete text, chunking it to respect the 512-token limit."""
    try:
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0

        for word in words:
            word_tokens = len(classifier.tokenizer.encode(word))
            if current_length + word_tokens > 512:
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_length = word_tokens
            else:
                current_chunk.append(word)
                current_length += word_tokens

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        if not chunks:
            chunks = [text]

        all_scores = []
        for chunk in chunks:
            try:
                # Direct classification; the pipeline handles tokenization internally
                result = classifier(chunk)
                scores = result[0]
                all_scores.append(scores)
            except Exception as chunk_error:
                st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
                continue

        if all_scores:
            # Average each label's score across chunks and keep the strongest one
            label_scores = {}
            count = len(all_scores)
            for scores in all_scores:
                label = scores['label']
                if label not in label_scores:
                    label_scores[label] = 0
                label_scores[label] += scores['score']
            avg_scores = {label: score / count for label, score in label_scores.items()}
            final_emotion = max(avg_scores.items(), key=lambda x: x[1])[0]
            return final_emotion

        return "LABEL_2"
    except Exception as e:
        st.warning(f"Error in emotion classification: {str(e)}")
        return "LABEL_2"


def get_embedding_for_text(text, tokenizer, model):
    """Get a single embedding for a complete text by averaging chunk embeddings."""
    chunks = split_text(text)
    chunk_embeddings = []
    chunk_lengths = []  # word counts of the chunks that embedded successfully

    for chunk in chunks:
        try:
            inputs = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            )
            inputs = {k: v.to(model.device) for k, v in inputs.items()}
            with torch.no_grad():
                # With torchscript=True the model returns a tuple; take the hidden states
                outputs = model(**inputs)[0]
            # Use the [CLS] token embedding as the chunk representation
            embedding = outputs[:, 0, :].cpu().numpy()
            chunk_embeddings.append(embedding[0])
            chunk_lengths.append(len(chunk.split()))
        except Exception as e:
            st.warning(f"Error processing chunk: {str(e)}")
            continue

    if chunk_embeddings:
        # Weight each successful chunk's embedding by its word count
        weights = np.array(chunk_lengths, dtype=float)
        weights = weights / weights.sum()
        weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
        return weighted_embedding

    return np.zeros(model.config.hidden_size)


def format_topics(topic_model, topic_counts):
    """Format topics for display."""
    formatted_topics = []
    for topic_num, count in topic_counts:
        if topic_num == -1:
            topic_label = "Miscellaneous"
        else:
            words = topic_model.get_topic(topic_num)
            topic_label = " | ".join([word for word, _ in words[:5]])
        formatted_topics.append({
            'topic': topic_label,
            'count': count
        })
    return formatted_topics


def format_emotions(emotion_counts):
    """Format emotions for display."""
    EMOTION_LABELS = {
        'LABEL_0': 'Negative',
        'LABEL_1': 'Positive',
        'LABEL_2': 'Neutral'
    }
    formatted_emotions = []
    for label, count in emotion_counts:
        emotion = EMOTION_LABELS.get(label, label)
        formatted_emotions.append({
            'emotion': emotion,
            'count': count
        })
    return formatted_emotions


def get_optimized_topic_model(bert_model, nr_topics="auto", min_topic_size=5):
    """Configure BERTopic for better CPU performance."""
    return BERTopic(
        embedding_model=bert_model,
        language="arabic",
        calculate_probabilities=False,
        verbose=False,
        n_gram_range=(1, 1),
        min_topic_size=min_topic_size,
        nr_topics=nr_topics,
        low_memory=True
    )


def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier,
                          top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
    """Process the data and generate per-country summaries with flexible topic configuration."""
    summaries = []

    # Honour the manually chosen topic count when one is provided
    topic_model = get_optimized_topic_model(
        bert_model,
        nr_topics=n_topics if topic_strategy == "Manual" and n_topics else "auto",
        min_topic_size=min_topic_size
    )

    vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS), min_df=1, max_df=1.0)
    topic_model.vectorizer_model = vectorizer

    for country, group in df.groupby('country'):
        progress_text = f"Processing poems for {country}..."
        progress_bar = st.progress(0, text=progress_text)

        texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
        all_emotions = []

        # Generate embeddings, keeping texts and embeddings aligned
        valid_texts = []
        embeddings = []
        for i, text in enumerate(texts):
            try:
                embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
                if embedding is not None and not np.isnan(embedding).any():
                    embeddings.append(embedding)
                    valid_texts.append(text)
                else:
                    st.warning(f"Invalid embedding generated for text {i+1} in {country}")
                    continue
            except Exception as e:
                st.warning(f"Error generating embedding for text {i+1} in {country}: {str(e)}")
                continue
            progress = (i + 1) / len(texts) * 0.4
            progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")

        # Keep only the texts whose embeddings succeeded so both lists stay aligned
        texts = valid_texts
        embeddings = np.array(embeddings)

        for i, text in enumerate(texts):
            emotion = classify_emotion(text, emotion_classifier)
            all_emotions.append(emotion)
            progress = 0.4 + ((i + 1) / len(texts) * 0.3)
            progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")

        try:
            if len(texts) < min_topic_size:
                st.warning(f"Not enough documents for {country} to generate meaningful topics "
                           f"(minimum {min_topic_size} required)")
                continue

            topics, probs = topic_model.fit_transform(texts, embeddings)

            topic_counts = Counter(topics)
            top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
            top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))

            summaries.append({
                'country': country,
                'total_poems': len(texts),
                'top_topics': top_topics,
                'top_emotions': top_emotions
            })
            progress_bar.progress(1.0, text="Processing complete!")
        except Exception as e:
            st.warning(f"Could not generate topics for {country}: {str(e)}")
            continue

    return summaries, topic_model


# Load the models once at startup
try:
    bert_tokenizer, bert_model, emotion_classifier = load_models()
    st.success("Models loaded successfully!")
except Exception as e:
    st.error(f"Error loading models: {str(e)}")
    st.stop()

# Main app interface
st.title("📚 Arabic Poem Analysis")
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")

uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])

if uploaded_file is not None:
    try:
        if uploaded_file.name.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)

        required_columns = ['country', 'poem']
        if not all(col in df.columns for col in required_columns):
            st.error("File must contain 'country' and 'poem' columns.")
            st.stop()

        df['country'] = df['country'].str.strip()
        df = df.dropna(subset=['country', 'poem'])

        st.subheader("Topic Modeling Settings")
        col1, col2 = st.columns(2)

        with col1:
            topic_strategy = st.radio(
                "Topic Number Strategy",
                ["Auto", "Manual"],
                help="Choose whether to let the model determine the optimal number of topics or set it manually"
            )
            if topic_strategy == "Manual":
                n_documents = len(df)
                max_topics = 500
                min_topics = 5
                default_topics = 20
                n_topics = st.slider(
                    "Number of Topics",
                    min_value=min_topics,
                    max_value=max_topics,
                    value=default_topics,
                    help=f"Select the desired number of topics (up to {max_topics})"
                )
                st.info(f"""
                💡 For your dataset of {n_documents:,} documents:
                - Available topic range: {min_topics}-{max_topics}
                - Recommended range: {max_topics//10}-{max_topics//3} for optimal coherence
                """)

        with col2:
            top_n = st.number_input(
                "Number of top topics/emotions to display:",
                min_value=1,
                max_value=100,
                value=10
            )

        if st.button("Process Data"):
            with st.spinner("Processing your data..."):
                summaries, topic_model = process_and_summarize(
                    df,
                    bert_tokenizer,
                    bert_model,
                    emotion_classifier,
                    top_n=top_n,
                    topic_strategy=topic_strategy,
                    n_topics=n_topics if topic_strategy == "Manual" else None,
                    min_topic_size=3
                )

            if summaries:
                st.success("Analysis complete!")

                tab1, tab2, tab3 = st.tabs(["Country Summaries", "Global Topics", "Theme Map"])

                with tab1:
                    for summary in summaries:
                        with st.expander(f"📍 {summary['country']} ({summary['total_poems']} poems)"):
                            col1, col2 = st.columns(2)

                            with col1:
                                st.subheader("Top Topics")
                                for topic in summary['top_topics']:
                                    st.write(f"• {topic['topic']}: {topic['count']} poems")

                            with col2:
                                st.subheader("Emotions")
                                for emotion in summary['top_emotions']:
                                    st.write(f"• {emotion['emotion']}: {emotion['count']} poems")

                            st.subheader("Word Cloud Visualization")
                            country_poems = df[df['country'] == summary['country']]['poem']
                            combined_text = ' '.join(country_poems)
                            wordcloud_fig = create_arabic_wordcloud(
                                combined_text,
                                f"Most Common Words in {summary['country']} Poems"
                            )
                            st.pyplot(wordcloud_fig)

                with tab2:
                    st.subheader("Global Topic Distribution")
                    topic_info = topic_model.get_topic_info()
                    for _, row in topic_info.iterrows():
                        if row['Topic'] == -1:
                            topic_name = "Miscellaneous"
                        else:
                            words = topic_model.get_topic(row['Topic'])
                            topic_name = " | ".join([word for word, _ in words[:5]])
                        st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")

                with tab3:
                    st.subheader("Thematic Distribution Map")
                    theme_map = create_theme_map(summaries, topic_model)
                    # Only render the map if it was built successfully
                    if theme_map is not None:
                        folium_static(theme_map)

    except Exception as e:
        st.error(f"Error processing file: {str(e)}")
else:
    st.info("👆 Upload a file to get started!")
    st.write("### Expected File Format:")
    example_df = pd.DataFrame({
        'country': ['Egypt', 'Palestine'],
        'poem': ['قصيدة مصرية', 'قصيدة فلسطينية']
    })
    st.dataframe(example_df)
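# How to run this app (a minimal sketch; the file name "app.py" is an assumption,
# and the dependency list is inferred from the imports above, not part of the original source):
#
#   pip install streamlit pandas numpy torch transformers scikit-learn bertopic \
#       wordcloud matplotlib folium country_converter streamlit-folium openpyxl
#   streamlit run app.py
#
# The uploaded CSV/Excel file must contain 'country' and 'poem' columns,
# as shown in the example dataframe rendered above.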