import re
import string

import joblib
import nltk
import streamlit as st
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the tokenizer and stopword data NLTK needs at runtime.
nltk.download('punkt')
nltk.download('punkt_tab')  # required by word_tokenize on newer NLTK releases
nltk.download('stopwords')


def preprocess_text(text):
    """Lowercase, strip punctuation and digits, collapse whitespace, and drop stopwords."""
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    text = re.sub(r'\d+', '', text)
    text = ' '.join(text.split())
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(tokens)


# Load the trained classifier and the TF-IDF vectorizer saved at training time.
# st.cache_resource keeps them in memory across Streamlit reruns.
@st.cache_resource
def load_artifacts():
    model = joblib.load('spam_detector_model.joblib')
    vectorizer = joblib.load('tfidf_vectorizer.joblib')
    return model, vectorizer


model, vectorizer = load_artifacts()

st.title("📧 Spam Message Detector")
st.write("""
This app detects whether a message is spam or not.
Enter your message below and click 'Analyze' to check!
""")

message = st.text_area("Enter your message:", height=100)

if st.button("Analyze"):
    if message:
        # Apply the same preprocessing used at training time, then vectorize.
        processed_text = preprocess_text(message)
        text_vectorized = vectorizer.transform([processed_text])

        prediction = model.predict(text_vectorized)[0]
        probability = model.predict_proba(text_vectorized)[0]

        st.markdown("### Analysis Result")
        if prediction == 1:
            st.error("🚨 This message is likely SPAM!")
            st.write(f"Confidence: {probability[1]:.2%}")
        else:
            st.success("✅ This message appears to be legitimate.")
            st.write(f"Confidence: {probability[0]:.2%}")

        with st.expander("See preprocessing steps"):
            st.write("Original message:", message)
            st.write("Processed message:", processed_text)
    else:
        st.warning("Please enter a message to analyze.")

with st.sidebar:
    st.header("About the Model")
    st.write("""
    This spam detector uses an XGBoost classifier trained on a dataset
    of spam and legitimate messages.

    Model Performance:
    - Training Accuracy: 99.7%
    - Testing Accuracy: 98.9%
    """)
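

# ---------------------------------------------------------------------------
# Hedged sketch: one way the two artifacts loaded above could be produced.
# The sidebar only states that the model is an XGBoost classifier paired with
# a TF-IDF vectorizer; the dataset path, column names, split, and parameters
# below are illustrative assumptions, not the actual training script. The app
# never calls this function.
# ---------------------------------------------------------------------------
def train_and_save_model(csv_path="spam_data.csv"):
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.model_selection import train_test_split
    from xgboost import XGBClassifier

    # Assumed columns: 'text' (message body) and 'label' (1 = spam, 0 = ham).
    df = pd.read_csv(csv_path)
    texts = df["text"].apply(preprocess_text)
    labels = df["label"]

    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, random_state=42, stratify=labels
    )

    # Fit the vectorizer on training text only, then reuse it for the test set.
    tfidf = TfidfVectorizer()
    X_train_vec = tfidf.fit_transform(X_train)
    X_test_vec = tfidf.transform(X_test)

    clf = XGBClassifier(eval_metric="logloss")
    clf.fit(X_train_vec, y_train)
    print(f"Test accuracy: {clf.score(X_test_vec, y_test):.3f}")

    # Save under the filenames the app expects.
    joblib.dump(clf, "spam_detector_model.joblib")
    joblib.dump(tfidf, "tfidf_vectorizer.joblib")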