import re
import pickle
from urllib.parse import urlparse

import gradio as gr
import numpy as np
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the trained model
model = tf.keras.models.load_model('new_phishing_detection_model.keras')

# Compile with standard loss and metrics (only needed for evaluation;
# predict() works on a loaded model without recompiling)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
)

# Download the NLTK resources used for tokenization and lemmatization
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def extract_domain(url):
    # urlparse only fills netloc when a scheme (or '//') is present,
    # so prepend one if the input lacks it
    if '//' not in url:
        url = '//' + url
    return urlparse(url).netloc

def normalize_length(text, target_length=50):
    # Pad with spaces or truncate so every URL string has a fixed length
    if len(text) < target_length:
        text = text + " " * (target_length - len(text))
    else:
        text = text[:target_length]
    return text

def preprocess_url(url):
    url = url.lower()
    url = re.sub(r'https?://', '', url)
    url = re.sub(r'www\.', '', url)
    # Remove the domain with str.replace rather than re.sub: domains contain
    # regex metacharacters such as '.', which would change the match
    domain = extract_domain(url)
    if domain:
        url = url.replace(domain, '', 1)
    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
    url = re.sub(r'\s+', ' ', url).strip()
    url = normalize_length(url)
    tokens = word_tokenize(url)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

def preprocess_html(html):
    html = re.sub(r'<[^>]+>', ' ', html)  # strip HTML tags
    html = html.lower()
    html = re.sub(r'https?://', '', html)
    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)
    html = re.sub(r'\s+', ' ', html).strip()
    tokens = word_tokenize(html)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

def extract_features(url):
    # Simple handcrafted features fed to the model alongside the text inputs
    features = {}
    features['length'] = len(url)
    features['num_special_chars'] = len(re.findall(r'[^a-zA-Z0-9]', url))
    features['num_digits'] = len(re.findall(r'\d', url))
    return features

# Sequence lengths and vocabulary size must match the values used at training time
max_url_length = 180
max_html_length = 2000
max_words = 10000

# Load the tokenizers fitted during training
with open('url_tokenizer.pkl', 'rb') as f:
    url_tokenizer = pickle.load(f)
with open('html_tokenizer.pkl', 'rb') as f:
    html_tokenizer = pickle.load(f)

def preprocess_input(input_text, tokenizer, max_length):
    sequences = tokenizer.texts_to_sequences([input_text])
    padded_sequences = pad_sequences(sequences, maxlen=max_length,
                                     padding='post', truncating='post')
    return padded_sequences

def get_prediction(input_text):
    # The same raw input is run through both the URL and HTML pipelines;
    # the model takes three inputs: URL tokens, HTML tokens, and features
    cleaned_url = preprocess_url(input_text)
    cleaned_html = preprocess_html(input_text)
    url_data = preprocess_input(cleaned_url, url_tokenizer, max_url_length)
    html_data = preprocess_input(cleaned_html, html_tokenizer, max_html_length)
    features = extract_features(input_text)
    features_vector = np.array([[features['length'],
                                 features['num_special_chars'],
                                 features['num_digits']]])
    input_data = [url_data, html_data, features_vector]
    prediction = model.predict(input_data)[0][0]
    return prediction

def phishing_detection(input_text):
    prediction = get_prediction(input_text)
    if prediction > 0.5:
        return f"Warning: This site is likely a phishing site! ({prediction:.2f})"
    return f"Safe: This site is not likely a phishing site. ({prediction:.2f})"

iface = gr.Interface(
    fn=phishing_detection,
    inputs=gr.components.Textbox(lines=5, placeholder="Enter a URL or HTML code"),
    outputs=gr.components.Textbox(label="Phishing Detection Result"),
    title="Phishing Detection Model",
    description="Check whether a URL or HTML snippet is phishing.",
    theme="default",
)

iface.launch()
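
# Optional sanity check (a minimal sketch; the example URL below is
# hypothetical): call the pipeline directly in a Python shell before
# launching the UI, to confirm the model and tokenizer files load and the
# three model inputs line up. Run it instead of iface.launch(), since
# launch() blocks.
#
#   print(phishing_detection("http://secure-login.example.com/account/verify"))
#   # -> "Warning: ..." or "Safe: ..." with the model's score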