|
import streamlit as st |
|
import joblib |
|
import nltk |
|
from nltk.corpus import stopwords |
|
from nltk.tokenize import word_tokenize |
|
import string |
|
import re |
|
|
|
|
|
# Make sure the NLTK data needed by word_tokenize and the stop-word corpus is
# installed. Each resource is looked up locally first, so the download server
# is only contacted when something is actually missing.
# "punkt_tab" is requested in addition to "punkt" because recent NLTK releases
# load the tokenizer tables from it.
for _resource_path, _package_id in (
    ("tokenizers/punkt", "punkt"),
    ("tokenizers/punkt_tab", "punkt_tab"),
    ("corpora/stopwords", "stopwords"),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_id, quiet=True)
|
|
|
def preprocess_text(text: str) -> str:
    """Normalize a raw message into a space-joined string of tokens.

    The steps run in this order: lowercase, delete ASCII punctuation, remove
    digit runs, collapse whitespace, tokenize with NLTK's ``word_tokenize``,
    and drop English stop words.

    Args:
        text: The message to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    text = text.lower()

    # Punctuation is deleted rather than replaced by a space, so "a,b"
    # becomes "ab". One translate pass replaces the per-character filter.
    text = text.translate(str.maketrans('', '', string.punctuation))

    text = re.sub(r'\d+', '', text)

    # Collapse whitespace runs left behind by the removals and trim the ends.
    text = ' '.join(text.split())

    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    return ' '.join(tokens)
|
|
|
|
|
# Streamlit re-executes this script on every interaction; caching the loader
# keeps the artifacts from being read from disk again each time. Streamlit
# releases without st.cache_resource fall back to a plain, uncached load.
_cache_resource = getattr(st, "cache_resource", lambda loader: loader)


@_cache_resource
def _load_artifacts():
    """Load the classifier and the vectorizer from their .joblib files.

    Both paths are relative, so they resolve against the current working
    directory. joblib.load unpickles the files; only load trusted artifacts.

    Returns:
        A ``(model, vectorizer)`` tuple.
    """
    return (
        joblib.load('spam_detector_model.joblib'),
        joblib.load('tfidf_vectorizer.joblib'),
    )


model, vectorizer = _load_artifacts()
|
|
|
|
|
# Page heading; the envelope emoji is restored from its mis-decoded form.
st.title("📧 Spam Message Detector")

st.write("""
This app detects whether a message is spam or not.
Enter your message below and click 'Analyze' to check!
""")

# Free-text input; its value is consumed when the "Analyze" button is pressed.
message = st.text_area("Enter your message:", height=100)
|
|
|
# Classify the entered message when the button is pressed and show the result.
if st.button("Analyze"):
    # Whitespace-only input is treated as empty so it is not classified.
    if message.strip():
        processed_text = preprocess_text(message)

        # transform expects an iterable of documents, hence the one-item list.
        text_vectorized = vectorizer.transform([processed_text])

        prediction = model.predict(text_vectorized)[0]
        probability = model.predict_proba(text_vectorized)[0]

        st.markdown("### Analysis Result")

        # Label 1 is treated as spam. Indexing probability with 0/1 assumes
        # the model's class order is [0, 1] -- TODO confirm against training.
        if prediction == 1:
            st.error("🚨 This message is likely SPAM!")
            st.write(f"Confidence: {probability[1]:.2%}")
        else:
            st.success("✅ This message appears to be legitimate.")
            st.write(f"Confidence: {probability[0]:.2%}")

        # Show the text before and after cleaning.
        with st.expander("See preprocessing steps"):
            st.write("Original message:", message)
            st.write("Processed message:", processed_text)
    else:
        st.warning("Please enter a message to analyze.")
|
|
|
|
|
# Static "about" panel rendered in the sidebar.
with st.sidebar:
    st.header("About the Model")
    st.write("""
    This spam detector uses an XGBoost classifier trained on a dataset of spam and legitimate messages.

    Model Performance:
    - Training Accuracy: 99.7%
    - Testing Accuracy: 98.9%
    """)
|
|