import gradio as gr
import tensorflow as tf
import numpy as np
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
from urllib.parse import urlparse

# Load the model
model = tf.keras.models.load_model('new_phishing_detection_model.keras')

# Re-compile to attach loss and evaluation metrics; model.predict() does not
# strictly require this step.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
              loss='binary_crossentropy',
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

# Download NLTK resources needed by the preprocessing functions
nltk.download('punkt')
nltk.download('punkt_tab')  # tokenizer tables required by newer NLTK releases
nltk.download('stopwords')
nltk.download('wordnet')

STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def extract_domain(url):
    # urlparse only fills netloc when a scheme (e.g. https://) is present
    domain = urlparse(url).netloc
    return domain

def normalize_length(text, target_length=50):
    # Pad with spaces (dropped later by the tokenizer) or truncate to a fixed length
    if len(text) < target_length:
        text = text + " " * (target_length - len(text))
    else:
        text = text[:target_length]
    return text

def preprocess_url(url):
    url = url.lower()
    # Capture the domain while the scheme is still present; on a scheme-less
    # string urlparse().netloc is empty and the domain was never stripped.
    domain = extract_domain(url)
    url = re.sub(r'https?://', '', url)
    if domain:
        url = url.replace(domain, '', 1)  # plain replace avoids regex metacharacters in the domain
    url = re.sub(r'www\.', '', url)
    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
    url = re.sub(r'\s+', ' ', url).strip()
    url = normalize_length(url)
    tokens = word_tokenize(url)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
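
# Worked example (assuming the fixed pipeline above): the scheme and domain are
# dropped and only the path/query tokens survive, e.g.
#   preprocess_url("https://www.example.com/login?user=1")  ->  "login user 1"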

def preprocess_html(html):
    html = re.sub(r'<[^>]+>', ' ', html)  # strip HTML tags along with their attributes
    html = html.lower()
    html = re.sub(r'https?://', '', html)
    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)
    html = re.sub(r'\s+', ' ', html).strip()
    tokens = word_tokenize(html)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
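
# Worked example: tags (including hrefs inside them) are removed before
# tokenizing, and stopwords such as "here" are filtered out, e.g.
#   preprocess_html("<a href='https://x.com'>Click Here</a>")  ->  "click"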

def extract_features(url):
    features = {}
    features['length'] = len(url)
    features['num_special_chars'] = len(re.findall(r'[^a-zA-Z0-9]', url))
    features['num_digits'] = len(re.findall(r'\d', url))
    return features
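
# Worked example: three handcrafted counts computed on the raw input, e.g.
#   extract_features("https://example.com/login")
#   -> {'length': 25, 'num_special_chars': 5, 'num_digits': 0}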

max_url_length = 180
max_html_length = 2000
max_words = 10000  # vocabulary cap, presumably used when the tokenizers were fitted; unused at inference

# Load tokenizers
with open('url_tokenizer.pkl', 'rb') as f:
    url_tokenizer = pickle.load(f)
with open('html_tokenizer.pkl', 'rb') as f:
    html_tokenizer = pickle.load(f)

def preprocess_input(input_text, tokenizer, max_length):
    sequences = tokenizer.texts_to_sequences([input_text])
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    return padded_sequences
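
# Each branch is padded/truncated to a fixed width, so the model always sees
# shape (1, 180) for the URL branch and (1, 2000) for the HTML branch, e.g.
#   preprocess_input("login user 1", url_tokenizer, max_url_length).shape  # (1, 180)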

def get_prediction(input_text):
    cleaned_url = preprocess_url(input_text)
    cleaned_html = preprocess_html(input_text)
    
    url_data = preprocess_input(cleaned_url, url_tokenizer, max_url_length)
    html_data = preprocess_input(cleaned_html, html_tokenizer, max_html_length)
    
    features = extract_features(input_text)
    features_vector = np.array([[features['length'], features['num_special_chars'], features['num_digits']]])
    
    input_data = [url_data, html_data, features_vector]
    prediction = model.predict(input_data)[0][0]
    return prediction
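
# Quick sanity check (hypothetical URL; requires the model and tokenizer files
# above to be present). Uncomment to print the raw sigmoid score in [0, 1]:
#   print(f"score: {get_prediction('http://secure-login.example-bank.com/verify'):.3f}")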

def phishing_detection(input_text):
    prediction = get_prediction(input_text)
    if prediction > 0.5:
        return f"Warning: This site is likely a phishing site! ({prediction:.2f})"
    else:
        return f"Safe: This site is not likely a phishing site. ({prediction:.2f})"

iface = gr.Interface(
    fn=phishing_detection,
    inputs=gr.components.Textbox(lines=5, placeholder="Enter URL or HTML code"),
    outputs=gr.components.Textbox(label="Phishing Detection Result"),
    title="Phishing Detection Model",
    description="Check if a URL or HTML is Phishing.",
    theme="default"
)

iface.launch()
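
# When hosting outside a notebook (e.g. in a Docker container), binding to all
# interfaces is a common alternative:
#   iface.launch(server_name="0.0.0.0", server_port=7860)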