import gradio as gr
import tensorflow as tf
import numpy as np
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
from urllib.parse import urlparse
# Load the model
model = tf.keras.models.load_model('new_phishing_detection_model.keras')
# Compile the model with standard loss and metrics
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
              loss='binary_crossentropy',
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
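# Note: compiling is only required if you evaluate or fine-tune in this app;
# model.predict() below works on the loaded model without it. The dual-input
# layout assumed throughout (URL branch first, HTML branch second) can be
# sanity-checked with, e.g.:
#   print([tuple(inp.shape) for inp in model.inputs])
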
# Preprocessing functions
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
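# Extra safeguard (not part of the original setup): recent NLTK releases
# resolve word_tokenize through the 'punkt_tab' resource, so fetching it as
# well keeps tokenization working across NLTK versions.
try:
    nltk.download('punkt_tab')
except Exception:
    pass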
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def extract_domain(url):
    domain = urlparse(url).netloc
    return domain

def normalize_length(text, target_length=50):
    if len(text) < target_length:
        text = text + " " * (target_length - len(text))
    else:
        text = text[:target_length]
    return text

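# Illustrative behaviour (not from the original file): shorter strings are
# right-padded with spaces and longer ones truncated, so every cleaned URL
# string has exactly target_length characters before tokenization:
#   normalize_length("login", 8)          -> "login   "
#   normalize_length("verify-account", 8) -> "verify-a"
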
def preprocess_url(url):
    url = url.lower()
    # Extract the domain before the scheme is stripped; urlparse only fills
    # in netloc while the scheme is still present.
    domain = extract_domain(url)
    url = re.sub(r'https?://', '', url)
    url = re.sub(r'www\.', '', url)
    if domain:
        # Plain string replacement avoids treating dots in the domain as
        # regex wildcards; 'www.' has already been stripped from the URL.
        url = url.replace(re.sub(r'^www\.', '', domain), '', 1)
    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
    url = re.sub(r'\s+', ' ', url).strip()
    url = normalize_length(url)
    tokens = word_tokenize(url)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

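# Illustrative example (not from the original file): with the steps above,
# "https://www.example.com/verify-account?id=123" reduces to the path/query
# tokens "verify account id 123" (padded to 50 characters before tokenizing).
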
def preprocess_html(html):
    html = re.sub(r'<[^>]+>', ' ', html)
    html = html.lower()
    html = re.sub(r'https?://', '', html)
    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)
    html = re.sub(r'\s+', ' ', html).strip()
    tokens = word_tokenize(html)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Sequence lengths and vocabulary size; these should match the values used
# when the tokenizers and model were trained (max_words is kept from the
# original script even though it is not referenced below).
max_url_length = 180
max_html_length = 2000
max_words = 10000
# Load tokenizers
with open('url_tokenizer.pkl', 'rb') as f:
    url_tokenizer = pickle.load(f)
with open('html_tokenizer.pkl', 'rb') as f:
    html_tokenizer = pickle.load(f)

def preprocess_input(input_text, tokenizer, max_length):
    sequences = tokenizer.texts_to_sequences([input_text])
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    return padded_sequences

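# The result is a (1, max_length) integer array: texts_to_sequences maps each
# token to its tokenizer index, and pad_sequences pads/truncates on the right,
# which is the single-example shape each branch of the model expects below.
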
def get_prediction(input_text, is_html):
    if not is_html:
        cleaned_text = preprocess_url(input_text)
        input_data = preprocess_input(cleaned_text, url_tokenizer, max_url_length)
        input_data = [input_data, np.zeros((1, max_html_length))]  # dummy HTML input
    else:
        cleaned_text = preprocess_html(input_text)
        input_data = preprocess_input(cleaned_text, html_tokenizer, max_html_length)
        input_data = [np.zeros((1, max_url_length)), input_data]  # dummy URL input
    prediction = model.predict(input_data)[0][0]
    return prediction

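# Illustrative usage (assumes the model and tokenizer files above are present):
#   score = get_prediction("http://example.com/verify-account?id=123", False)
# score is the model's phishing score for the input; the logic below treats
# values above 0.5 as phishing.
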
# List of known safe domains to help prevent false positives
safe_domains = ['perplexity.ai', 'google.com', 'wikipedia.org']
def phishing_detection(input_text):
    is_html = bool(re.search(r'<[^>]+>', input_text))
    if not is_html:
        # Allowlist check: add a scheme if it is missing so urlparse can find
        # the domain, and ignore a leading 'www.'.
        url = input_text if '://' in input_text else f'http://{input_text}'
        domain = re.sub(r'^www\.', '', extract_domain(url))
        if domain in safe_domains:
            return f"Safe: This site is a known safe domain. (Domain: {domain})"
    prediction = get_prediction(input_text, is_html)
    if prediction > 0.5:
        return f"Warning: This site is likely a phishing site! ({prediction:.2f})"
    else:
        return f"Safe: This site is not likely a phishing site. ({prediction:.2f})"

iface = gr.Interface(
    fn=phishing_detection,
    inputs=gr.components.Textbox(lines=5, placeholder="Enter URL or HTML code"),
    outputs=gr.components.Textbox(label="Phishing Detection Result"),
    title="Phishing Detection Model",
    description="Check whether a URL or an HTML snippet is phishing.",
    theme="default"
)
iface.launch()
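# On Hugging Face Spaces launch() needs no arguments; for local debugging you
# could instead call e.g. iface.launch(share=True) to expose a temporary
# public link (an optional tweak, not part of the original app).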