Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -8,6 +8,7 @@ from nltk.tokenize import word_tokenize
|
|
8 |
from nltk.stem import WordNetLemmatizer
|
9 |
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
10 |
import re
|
|
|
11 |
|
12 |
# Load the model
|
13 |
model = tf.keras.models.load_model('new_phishing_detection_model.keras')
|
@@ -25,6 +26,10 @@ nltk.download('wordnet')
|
|
25 |
STOPWORDS = set(stopwords.words('english'))
|
26 |
lemmatizer = WordNetLemmatizer()
|
27 |
|
|
|
|
|
|
|
|
|
28 |
def normalize_length(text, target_length=50):
|
29 |
if len(text) < target_length:
|
30 |
text = text + " " * (target_length - len(text))
|
@@ -36,6 +41,8 @@ def preprocess_url(url):
|
|
36 |
url = url.lower()
|
37 |
url = re.sub(r'https?://', '', url)
|
38 |
url = re.sub(r'www\.', '', url)
|
|
|
|
|
39 |
url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
|
40 |
url = re.sub(r'\s+', ' ', url).strip()
|
41 |
url = normalize_length(url)
|
|
|
8 |
from nltk.stem import WordNetLemmatizer
|
9 |
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
10 |
import re
|
11 |
+
from urllib.parse import urlparse
|
12 |
|
13 |
# Load the model
|
14 |
model = tf.keras.models.load_model('new_phishing_detection_model.keras')
|
|
|
26 |
STOPWORDS = set(stopwords.words('english'))
|
27 |
lemmatizer = WordNetLemmatizer()
|
28 |
|
29 |
+
def extract_domain(url):
    """Return the network-location part (``host[:port]``) of *url*.

    ``urllib.parse.urlparse`` only recognises a network location when the
    string contains the ``//`` authority marker.  Input whose scheme has
    already been stripped (e.g. ``"example.com/login"``) would therefore
    yield an empty string, so such input is re-parsed as if it started
    with ``//``.

    Args:
        url: The URL text, with or without a scheme.

    Returns:
        The netloc exactly as it appears in *url* (it may include userinfo
        and a port), or ``""`` when no host can be determined.  The value
        is raw text: pass it through ``re.escape`` before using it as a
        regular-expression pattern.

    Raises:
        ValueError: Propagated from ``urlparse`` for a malformed URL that
            carries an explicit ``//`` authority (e.g. an unterminated
            IPv6 literal), as before.
    """
    if not url:
        return ""

    domain = urlparse(url).netloc
    if domain or "://" in url:
        # Either the host was found, or the URL has an explicit scheme
        # with a genuinely empty authority; re-parsing would not help.
        return domain

    # No authority marker: treat the text as starting with the host.  A
    # leading "/" (pure path) still yields an empty netloc here.
    try:
        return urlparse("//" + url).netloc
    except ValueError:
        # The fallback must not introduce exceptions the plain parse
        # would not have raised (e.g. a stray "[" in free-form input).
        return ""
|
32 |
+
|
33 |
def normalize_length(text, target_length=50):
|
34 |
if len(text) < target_length:
|
35 |
text = text + " " * (target_length - len(text))
|
|
|
41 |
url = url.lower()
|
42 |
url = re.sub(r'https?://', '', url)
|
43 |
url = re.sub(r'www\.', '', url)
|
44 |
+
domain = extract_domain(url)
|
45 |
+
url = re.sub(domain, '', url)
|
46 |
url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
|
47 |
url = re.sub(r'\s+', ' ', url).strip()
|
48 |
url = normalize_length(url)
|