rmdhirr committed on
Commit
102a386
·
verified ·
1 Parent(s): a664f59

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -0
app.py CHANGED
@@ -8,6 +8,7 @@ from nltk.tokenize import word_tokenize
8
  from nltk.stem import WordNetLemmatizer
9
  from tensorflow.keras.preprocessing.sequence import pad_sequences
10
  import re
 
11
 
12
  # Load the model
13
  model = tf.keras.models.load_model('new_phishing_detection_model.keras')
@@ -25,6 +26,10 @@ nltk.download('wordnet')
25
  STOPWORDS = set(stopwords.words('english'))
26
  lemmatizer = WordNetLemmatizer()
27
 
 
 
 
 
28
  def normalize_length(text, target_length=50):
29
  if len(text) < target_length:
30
  text = text + " " * (target_length - len(text))
@@ -36,6 +41,8 @@ def preprocess_url(url):
36
  url = url.lower()
37
  url = re.sub(r'https?://', '', url)
38
  url = re.sub(r'www\.', '', url)
 
 
39
  url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
40
  url = re.sub(r'\s+', ' ', url).strip()
41
  url = normalize_length(url)
 
8
  from nltk.stem import WordNetLemmatizer
9
  from tensorflow.keras.preprocessing.sequence import pad_sequences
10
  import re
11
+ from urllib.parse import urlparse
12
 
13
  # Load the model
14
  model = tf.keras.models.load_model('new_phishing_detection_model.keras')
 
26
  STOPWORDS = set(stopwords.words('english'))
27
  lemmatizer = WordNetLemmatizer()
28
 
29
def extract_domain(url):
    """Return the network-location (host[:port]) part of *url*.

    ``urlparse`` only recognises a netloc when the URL contains ``//``
    (e.g. ``https://host/path``).  For scheme-less input such as
    ``"example.com/login"`` it reports an empty netloc, so the input is
    re-parsed as a network-path reference (``//example.com/login``).

    Args:
        url: URL string, with or without a scheme.

    Returns:
        The netloc string, or ``""`` when no host can be determined
        (empty input, a path-only string such as ``"/a/b"``, or a
        malformed host in scheme-less input).
    """
    netloc = urlparse(url).netloc
    if netloc or not url:
        return netloc
    try:
        # Prefixing "//" makes urlparse treat the leading component as the
        # host; a path-only input ("/a/b") still yields an empty netloc.
        return urlparse("//" + url).netloc
    except ValueError:
        # The fallback must not raise where the plain parse did not
        # (e.g. an unbalanced "[" is rejected as an invalid IPv6 host).
        return ""
32
+
33
  def normalize_length(text, target_length=50):
34
  if len(text) < target_length:
35
  text = text + " " * (target_length - len(text))
 
41
  url = url.lower()
42
  url = re.sub(r'https?://', '', url)
43
  url = re.sub(r'www\.', '', url)
44
+ domain = extract_domain(url)
45
+ url = re.sub(domain, '', url)
46
  url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
47
  url = re.sub(r'\s+', ' ', url).strip()
48
  url = normalize_length(url)