File size: 5,304 Bytes
90d7edb 8edfc45 90d7edb 8edfc45 90d7edb 8edfc45 90d7edb 8edfc45 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
import gradio as gr
import pytesseract
from PIL import Image
from transformers import pipeline
import re
# 1. Scam keyword list
# 'scam_keywords.txt' holds one keyword per line. Every line is trimmed and
# lowercased; lines that are empty after trimming are ignored.
SCAM_KEYWORDS = []
with open("scam_keywords.txt", "r", encoding="utf-8") as keyword_file:
    for raw_line in keyword_file:
        keyword = raw_line.strip()
        if keyword:
            SCAM_KEYWORDS.append(keyword.lower())
# 2. Zero-shot classification setup
# Labels the classifier scores every message against.
CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
# Checkpoint handed to the transformers pipeline below.
model_name = "joeddav/xlm-roberta-large-xnli"
classifier = pipeline(task="zero-shot-classification", model=model_name)
def keyword_and_url_boost(probabilities, text, *, keywords=None):
    """
    Shift probability mass toward the scam labels when scam signals appear.

    Two signals are looked for in the lowercased text:
    - scam keywords (plain substring match): +5% per keyword found,
      capped at +30%
    - at least one http:// or https:// URL: a flat +10%

    The combined boost (capped at +40%) is split evenly between "SMiShing"
    and "Other Scam", after which the three values are re-normalized so
    they sum to 1.

    Args:
        probabilities: dict mapping "SMiShing", "Other Scam" and
            "Legitimate" to their original probabilities.
        text: the combined text from user input + OCR. None is treated as
            an empty string.
        keywords: optional iterable of keywords to search for. They are
            compared against the lowercased text, so they should already
            be lowercase. Defaults to the module-level SCAM_KEYWORDS.

    Returns:
        A new dict of probabilities. When neither a keyword nor a URL is
        found it is an unchanged copy of `probabilities`; otherwise it
        holds the three boosted, re-normalized labels.

    Raises:
        KeyError: if a boost applies and one of the three labels is
            missing from `probabilities`.
    """
    if keywords is None:
        keywords = SCAM_KEYWORDS
    lower_text = (text or "").lower()

    # 1. Keyword signal: 5% per keyword found, capped at +30%
    keyword_count = sum(1 for kw in keywords if kw in lower_text)
    keyword_boost = min(0.05 * keyword_count, 0.30)

    # 2. URL signal: only presence matters, so stop at the first match
    url_boost = 0.10 if re.search(r"https?://\S+", lower_text) else 0.0

    # 3. Combined boost, capped at +40%
    total_boost = min(keyword_boost + url_boost, 0.40)
    if total_boost <= 0:
        # Return a copy so the caller's dict is never aliased by the result.
        return dict(probabilities)

    # 4. Split the boost equally between the two scam labels
    half_boost = total_boost / 2.0
    boosted = {
        "SMiShing": probabilities["SMiShing"] + half_boost,
        "Other Scam": probabilities["Other Scam"] + half_boost,
        "Legitimate": probabilities["Legitimate"],
    }

    # 5. Re-normalize so the values sum to 1
    total = sum(boosted.values())
    if total <= 0:
        # Defensive fallback for degenerate (e.g. negative) inputs.
        return {"SMiShing": 0.0, "Other Scam": 0.0, "Legitimate": 1.0}
    return {label: value / total for label, value in boosted.items()}
def smishing_detector(text, image):
    """
    Classify a message (typed text and/or a screenshot) as scam or legitimate.

    Steps:
    1. Extract text from the image (OCR) if provided.
    2. Combine with user-entered text.
    3. Zero-shot classification -> base probabilities.
    4. Keyword + URL boost -> adjusted probabilities.
    5. Return final label, confidence, etc.

    Args:
        text: message text entered by the user; may be empty or None.
        image: image passed to pytesseract for OCR, or None to skip OCR.

    Returns:
        A dict. When no text is available at all it contains a
        "No text provided" label with confidence 0.0 and empty
        keyword/URL lists. Otherwise it contains the text that was
        classified, the original and boosted probabilities (rounded to
        3 decimals), the winning label with its confidence, and the
        keywords and URLs found in the text.
    """
    # Step 1: OCR if there's an image
    combined_text = text if text else ""
    if image is not None:
        ocr_text = pytesseract.image_to_string(image, lang="spa+eng")
        combined_text += " " + ocr_text

    # Clean text
    combined_text = combined_text.strip()
    if not combined_text:
        return {
            "text_used_for_classification": "(none)",
            "label": "No text provided",
            "confidence": 0.0,
            "keywords_found": [],
            "urls_found": []
        }

    # Step 2: Zero-shot classification
    result = classifier(
        sequences=combined_text,
        candidate_labels=CANDIDATE_LABELS,
        hypothesis_template="This message is {}."
    )
    original_probs = dict(zip(result["labels"], result["scores"]))

    # Step 3: Keyword + URL boost
    boosted_probs = keyword_and_url_boost(original_probs, combined_text)

    # Step 4: Pick final label after boost
    final_label = max(boosted_probs, key=boosted_probs.get)
    final_confidence = round(boosted_probs[final_label], 3)

    # Step 5: Identify which keywords and URLs were found
    lower_text = combined_text.lower()
    found_keywords = [kw for kw in SCAM_KEYWORDS if kw in lower_text]
    # URLs are taken from the original text, not the lowercased copy: paths
    # and query strings are case-sensitive, so lowercasing them would report
    # a different URL than the one in the message. The scheme is matched
    # case-insensitively (ASCII only) so "HTTP://..." is still detected.
    found_urls = re.findall(r"[hH][tT][tT][pP][sS]?://\S+", combined_text)

    return {
        "text_used_for_classification": combined_text,
        "original_probabilities": {k: round(v, 3) for k, v in original_probs.items()},
        "boosted_probabilities": {k: round(v, 3) for k, v in boosted_probs.items()},
        "label": final_label,
        "confidence": final_confidence,
        "keywords_found": found_keywords,
        "urls_found": found_urls,
    }
# Gradio UI: a text box and an optional screenshot upload feed
# smishing_detector, whose result dict is rendered as JSON.
_sms_text_input = gr.Textbox(
    lines=3,
    label="Paste Suspicious SMS Text (English/Spanish)",
    placeholder="Type or paste the message here...",
)
_screenshot_input = gr.Image(
    type="pil",
    label="Or Upload a Screenshot (Optional)",
)
_APP_DESCRIPTION = """
This tool classifies messages as SMiShing, Other Scam, or Legitimate using a zero-shot model
(joeddav/xlm-roberta-large-xnli). It also checks for certain "scam keywords" (loaded from a file)
and any URLs, boosting the probability of a scam label if found.
Supports English & Spanish text (OCR included).
"""
demo = gr.Interface(
    fn=smishing_detector,
    inputs=[_sms_text_input, _screenshot_input],
    outputs="json",
    title="SMiShing & Scam Detector (Keyword + URL Boost)",
    description=_APP_DESCRIPTION,
    allow_flagging="never",
)
# Launch the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()