import gradio as gr
import tensorflow as tf
import numpy as np
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
from urllib.parse import urlparse
# Load the model
model = tf.keras.models.load_model('new_phishing_detection_model.keras')
# Compile the model with standard loss and metrics
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
              loss='binary_crossentropy',
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
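# Note: compiling is only required if you evaluate or fine-tune in this app;
# model.predict() below works on the loaded model without it. The dual-input
# layout assumed throughout (URL branch first, HTML branch second) can be
# sanity-checked with, e.g.:
#   print([tuple(inp.shape) for inp in model.inputs])
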
# Preprocessing functions
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
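# Extra safeguard (not part of the original setup): recent NLTK releases
# resolve word_tokenize through the 'punkt_tab' resource, so fetching it as
# well keeps tokenization working across NLTK versions.
try:
    nltk.download('punkt_tab')
except Exception:
    pass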
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def extract_domain(url):
    domain = urlparse(url).netloc
    return domain

def normalize_length(text, target_length=50):
    if len(text) < target_length:
        text = text + " " * (target_length - len(text))
    else:
        text = text[:target_length]
    return text

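# Illustrative behaviour (not from the original file): shorter strings are
# right-padded with spaces and longer ones truncated, so every cleaned URL
# string has exactly target_length characters before tokenization:
#   normalize_length("login", 8)          -> "login   "
#   normalize_length("verify-account", 8) -> "verify-a"
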
def preprocess_url(url):
    url = url.lower()
    # Extract the domain before the scheme is stripped; urlparse only fills
    # in netloc while the scheme is still present.
    domain = extract_domain(url)
    url = re.sub(r'https?://', '', url)
    url = re.sub(r'www\.', '', url)
    if domain:
        # Plain string replacement avoids treating dots in the domain as
        # regex wildcards; 'www.' has already been stripped from the URL.
        url = url.replace(re.sub(r'^www\.', '', domain), '', 1)
    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
    url = re.sub(r'\s+', ' ', url).strip()
    url = normalize_length(url)
    tokens = word_tokenize(url)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

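# Illustrative example (not from the original file): with the steps above,
# "https://www.example.com/verify-account?id=123" reduces to the path/query
# tokens "verify account id 123" (padded to 50 characters before tokenizing).
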
def preprocess_html(html):
    html = re.sub(r'<[^>]+>', ' ', html)
    html = html.lower()
    html = re.sub(r'https?://', '', html)
    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)
    html = re.sub(r'\s+', ' ', html).strip()
    tokens = word_tokenize(html)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Sequence lengths and vocabulary size; these should match the values used
# when the tokenizers and model were trained (max_words is kept from the
# original script even though it is not referenced below).
max_url_length = 180
max_html_length = 2000
max_words = 10000
# Load tokenizers
with open('url_tokenizer.pkl', 'rb') as f:
    url_tokenizer = pickle.load(f)
with open('html_tokenizer.pkl', 'rb') as f:
    html_tokenizer = pickle.load(f)

def preprocess_input(input_text, tokenizer, max_length):
    sequences = tokenizer.texts_to_sequences([input_text])
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    return padded_sequences

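# The result is a (1, max_length) integer array: texts_to_sequences maps each
# token to its tokenizer index, and pad_sequences pads/truncates on the right,
# which is the single-example shape each branch of the model expects below.
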
def get_prediction(input_text, is_html):
    if not is_html:
        cleaned_text = preprocess_url(input_text)
        input_data = preprocess_input(cleaned_text, url_tokenizer, max_url_length)
        input_data = [input_data, np.zeros((1, max_html_length))]  # dummy HTML input
    else:
        cleaned_text = preprocess_html(input_text)
        input_data = preprocess_input(cleaned_text, html_tokenizer, max_html_length)
        input_data = [np.zeros((1, max_url_length)), input_data]  # dummy URL input
    prediction = model.predict(input_data)[0][0]
    return prediction

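# Illustrative usage (assumes the model and tokenizer files above are present):
#   score = get_prediction("http://example.com/verify-account?id=123", False)
# score is the model's phishing score for the input; the logic below treats
# values above 0.5 as phishing.
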
# List of known safe domains to help prevent false positives
safe_domains = ['perplexity.ai', 'google.com', 'wikipedia.org']
def phishing_detection(input_text):
    is_html = bool(re.search(r'<[^>]+>', input_text))
    if not is_html:
        # Allowlist check: add a scheme if it is missing so urlparse can find
        # the domain, and ignore a leading 'www.'.
        url = input_text if '://' in input_text else f'http://{input_text}'
        domain = re.sub(r'^www\.', '', extract_domain(url))
        if domain in safe_domains:
            return f"Safe: This site is a known safe domain. (Domain: {domain})"
    prediction = get_prediction(input_text, is_html)
    if prediction > 0.5:
        return f"Warning: This site is likely a phishing site! ({prediction:.2f})"
    else:
        return f"Safe: This site is not likely a phishing site. ({prediction:.2f})"

iface = gr.Interface(
    fn=phishing_detection,
    inputs=gr.components.Textbox(lines=5, placeholder="Enter URL or HTML code"),
    outputs=gr.components.Textbox(label="Phishing Detection Result"),
    title="Phishing Detection Model",
    description="Check whether a URL or an HTML snippet is phishing.",
    theme="default"
)
iface.launch()
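# On Hugging Face Spaces launch() needs no arguments; for local debugging you
# could instead call e.g. iface.launch(share=True) to expose a temporary
# public link (an optional tweak, not part of the original app).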