import re
import pickle
from urllib.parse import urlparse

import gradio as gr
import numpy as np
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the trained model
model = tf.keras.models.load_model('new_phishing_detection_model.keras')

# Compile with standard loss and metrics (only needed for evaluation;
# predict() works on a loaded model without recompiling)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
)

# Download the NLTK resources used for tokenization and lemmatization
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def extract_domain(url):
    # urlparse only fills netloc when a scheme (or '//') is present,
    # so prepend one if the input lacks it
    if '//' not in url:
        url = '//' + url
    return urlparse(url).netloc

def normalize_length(text, target_length=50):
    # Pad with spaces or truncate so every URL string has a fixed length
    if len(text) < target_length:
        text = text + " " * (target_length - len(text))
    else:
        text = text[:target_length]
    return text

def preprocess_url(url):
    url = url.lower()
    url = re.sub(r'https?://', '', url)
    url = re.sub(r'www\.', '', url)
    # Remove the domain with str.replace rather than re.sub: domains contain
    # regex metacharacters such as '.', which would change the match
    domain = extract_domain(url)
    if domain:
        url = url.replace(domain, '', 1)
    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
    url = re.sub(r'\s+', ' ', url).strip()
    url = normalize_length(url)
    tokens = word_tokenize(url)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

def preprocess_html(html):
    html = re.sub(r'<[^>]+>', ' ', html)  # strip HTML tags
    html = html.lower()
    html = re.sub(r'https?://', '', html)
    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)
    html = re.sub(r'\s+', ' ', html).strip()
    tokens = word_tokenize(html)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

def extract_features(url):
    # Simple handcrafted features fed to the model alongside the text inputs
    features = {}
    features['length'] = len(url)
    features['num_special_chars'] = len(re.findall(r'[^a-zA-Z0-9]', url))
    features['num_digits'] = len(re.findall(r'\d', url))
    return features

# Sequence lengths and vocabulary size must match the values used at training time
max_url_length = 180
max_html_length = 2000
max_words = 10000

# Load the tokenizers fitted during training
with open('url_tokenizer.pkl', 'rb') as f:
    url_tokenizer = pickle.load(f)
with open('html_tokenizer.pkl', 'rb') as f:
    html_tokenizer = pickle.load(f)

def preprocess_input(input_text, tokenizer, max_length):
    sequences = tokenizer.texts_to_sequences([input_text])
    padded_sequences = pad_sequences(sequences, maxlen=max_length,
                                     padding='post', truncating='post')
    return padded_sequences

def get_prediction(input_text):
    # The same raw input is run through both the URL and HTML pipelines;
    # the model takes three inputs: URL tokens, HTML tokens, and features
    cleaned_url = preprocess_url(input_text)
    cleaned_html = preprocess_html(input_text)
    url_data = preprocess_input(cleaned_url, url_tokenizer, max_url_length)
    html_data = preprocess_input(cleaned_html, html_tokenizer, max_html_length)
    features = extract_features(input_text)
    features_vector = np.array([[features['length'],
                                 features['num_special_chars'],
                                 features['num_digits']]])
    input_data = [url_data, html_data, features_vector]
    prediction = model.predict(input_data)[0][0]
    return prediction

def phishing_detection(input_text):
    prediction = get_prediction(input_text)
    if prediction > 0.5:
        return f"Warning: This site is likely a phishing site! ({prediction:.2f})"
    return f"Safe: This site is not likely a phishing site. ({prediction:.2f})"

iface = gr.Interface(
    fn=phishing_detection,
    inputs=gr.components.Textbox(lines=5, placeholder="Enter a URL or HTML code"),
    outputs=gr.components.Textbox(label="Phishing Detection Result"),
    title="Phishing Detection Model",
    description="Check whether a URL or HTML snippet is phishing.",
    theme="default",
)

iface.launch()
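
# Optional sanity check (a minimal sketch; the example URL below is
# hypothetical): call the pipeline directly in a Python shell before
# launching the UI, to confirm the model and tokenizer files load and the
# three model inputs line up. Run it instead of iface.launch(), since
# launch() blocks.
#
#   print(phishing_detection("http://secure-login.example.com/account/verify"))
#   # -> "Warning: ..." or "Safe: ..." with the model's score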