web-phishing-detection

Sleeping

App Files Community

rmdhirr committed on Jun 16, 2024

Commit

65ec533

verified ·

1 Parent(s): bccb3f8

Update app.py

Browse files

Files changed (1) hide show

app.py +9 -20

app.py CHANGED Viewed

@@ -34,7 +34,7 @@ def normalize_length(text, target_length=50):
     if len(text) < target_length:
         text = text + " " * (target_length - len(text))
     else:
-        text = text[:target_length]
     return text
 def preprocess_url(url):
@@ -77,30 +77,19 @@ def preprocess_input(input_text, tokenizer, max_length):
     padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
     return padded_sequences
-def get_prediction(input_text, is_html):
-    if not is_html:
-        cleaned_text = preprocess_url(input_text)
-        input_data = preprocess_input(cleaned_text, url_tokenizer, max_url_length)
-        input_data = [input_data, np.zeros((1, max_html_length))]  # dummy HTML input
-    else:
-        cleaned_text = preprocess_html(input_text)
-        input_data = preprocess_input(cleaned_text, html_tokenizer, max_html_length)
-        input_data = [np.zeros((1, max_url_length)), input_data]  # dummy URL input
     prediction = model.predict(input_data)[0][0]
     return prediction
-# List of known safe domains to help prevent false positives
-safe_domains = ['perplexity.ai', 'google.com', 'wikipedia.org']
 def phishing_detection(input_text):
-    domain = extract_domain(input_text)
-    if domain in safe_domains:
-        return f"Safe: This site is a known safe domain. (Domain: {domain})"
-    is_html = bool(re.search(r'<[^>]+>', input_text))
-    prediction = get_prediction(input_text, is_html)
     if prediction > 0.5:
         return f"Warning: This site is likely a phishing site! ({prediction:.2f})"
     else:

     if len(text) < target_length:
         text = text + " " * (target_length - len(text))
     else:
+        text = text[: target_length]
     return text
 def preprocess_url(url):
     padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
     return padded_sequences
+def get_prediction(input_text):
+    cleaned_url = preprocess_url(input_text)
+    cleaned_html = preprocess_html(input_text)
+    url_data = preprocess_input(cleaned_url, url_tokenizer, max_url_length)
+    html_data = preprocess_input(cleaned_html, html_tokenizer, max_html_length)
+    input_data = [url_data, html_data]
     prediction = model.predict(input_data)[0][0]
     return prediction
 def phishing_detection(input_text):
+    prediction = get_prediction(input_text)
     if prediction > 0.5:
         return f"Warning: This site is likely a phishing site! ({prediction:.2f})"
     else: