web-phishing-detection

Sleeping

rmdhirr committed on Jun 16, 2024

Commit

2e97462

verified ·

1 Parent(s): a04dd4c

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -62,13 +62,6 @@ def preprocess_html(html):
     tokens = [lemmatizer.lemmatize(word) for word in tokens]
     return ' '.join(tokens)
-def extract_features(url):
-    features = {}
-    features['length'] = len(url)
-    features['num_special_chars'] = len(re.findall(r'[^a-zA-Z0-9]', url))
-    features['num_digits'] = len(re.findall(r'\d', url))
-    return features
 max_url_length = 180
 max_html_length = 2000
 max_words = 10000
@@ -91,11 +84,10 @@ def get_prediction(input_text):
     url_data = preprocess_input(cleaned_url, url_tokenizer, max_url_length)
     html_data = preprocess_input(cleaned_html, html_tokenizer, max_html_length)
-    features = extract_features(input_text)
-    features_vector = np.array([[features['length'], features['num_special_chars'], features['num_digits']]])
-    input_data = [url_data, html_data, features_vector]
-    prediction = model.predict(input_data)[0][0]
     return prediction
 def phishing_detection(input_text):

     tokens = [lemmatizer.lemmatize(word) for word in tokens]
     return ' '.join(tokens)
 max_url_length = 180
 max_html_length = 2000
 max_words = 10000
     url_data = preprocess_input(cleaned_url, url_tokenizer, max_url_length)
     html_data = preprocess_input(cleaned_html, html_tokenizer, max_html_length)
+    # Combine URL and HTML data by concatenation
+    combined_data = np.concatenate((url_data, html_data), axis=1)
+    prediction = model.predict(combined_data)[0][0]
     return prediction
 def phishing_detection(input_text):