web-phishing-detection

Sleeping

rmdhirr committed on Jun 16, 2024

Commit

a04dd4c

verified ·

1 Parent(s): 65ec533

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -34,7 +34,7 @@ def normalize_length(text, target_length=50):
     if len(text) < target_length:
         text = text + " " * (target_length - len(text))
     else:
-        text = text[: target_length]
     return text
 def preprocess_url(url):
@@ -62,6 +62,13 @@ def preprocess_html(html):
     tokens = [lemmatizer.lemmatize(word) for word in tokens]
     return ' '.join(tokens)
 max_url_length = 180
 max_html_length = 2000
 max_words = 10000
@@ -84,7 +91,10 @@ def get_prediction(input_text):
     url_data = preprocess_input(cleaned_url, url_tokenizer, max_url_length)
     html_data = preprocess_input(cleaned_html, html_tokenizer, max_html_length)
-    input_data = [url_data, html_data]
     prediction = model.predict(input_data)[0][0]
     return prediction

     if len(text) < target_length:
         text = text + " " * (target_length - len(text))
     else:
+        text = text[:target_length]
     return text
 def preprocess_url(url):
     tokens = [lemmatizer.lemmatize(word) for word in tokens]
     return ' '.join(tokens)
+def extract_features(url):
+    features = {}
+    features['length'] = len(url)
+    features['num_special_chars'] = len(re.findall(r'[^a-zA-Z0-9]', url))
+    features['num_digits'] = len(re.findall(r'\d', url))
+    return features
 max_url_length = 180
 max_html_length = 2000
 max_words = 10000
     url_data = preprocess_input(cleaned_url, url_tokenizer, max_url_length)
     html_data = preprocess_input(cleaned_html, html_tokenizer, max_html_length)
+    features = extract_features(input_text)
+    features_vector = np.array([[features['length'], features['num_special_chars'], features['num_digits']]])
+    input_data = [url_data, html_data, features_vector]
     prediction = model.predict(input_data)[0][0]
     return prediction