File size: 5,304 Bytes
90d7edb
8edfc45
 
 
 
90d7edb
8edfc45
 
 
 
90d7edb
8edfc45
 
 
90d7edb
8edfc45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import gradio as gr
import pytesseract
from PIL import Image
from transformers import pipeline
import re

# --- Classification labels ---------------------------------------------------
# The three candidate labels handed to the zero-shot classifier.
CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]

# --- Scam keyword list -------------------------------------------------------
# 'scam_keywords.txt' holds one keyword per line. Each line is trimmed, blank
# lines are dropped, and the remaining keywords are stored lower-cased.
with open("scam_keywords.txt", "r", encoding="utf-8") as f:
    SCAM_KEYWORDS = [kw.lower() for kw in map(str.strip, f) if kw]

# --- Zero-shot classification pipeline ---------------------------------------
model_name = "joeddav/xlm-roberta-large-xnli"
classifier = pipeline(task="zero-shot-classification", model=model_name)

def keyword_and_url_boost(probabilities, text, keywords=None):
    """
    Shift probability mass toward the scam labels when the text contains
    scam keywords and/or an http(s) URL.

    Args:
        probabilities: dict mapping "SMiShing", "Other Scam" and "Legitimate"
            to their base probabilities.
        text: the combined text from user input + OCR.
        keywords: optional iterable of keywords to search for. Matching is a
            plain substring test against the lower-cased text, so keywords
            are expected to be lower-case. Defaults to the module-level
            SCAM_KEYWORDS list.

    Returns:
        A new dict (the input is never returned or mutated). When neither a
        keyword nor a URL is found it is an unchanged copy of
        ``probabilities``; otherwise it holds the three labels with the boost
        applied and the values re-normalized to sum to 1.

    Raises:
        KeyError: if a boost applies and one of the three labels is missing
            from ``probabilities``.
    """
    if keywords is None:
        keywords = SCAM_KEYWORDS

    lower_text = text.lower()

    # 1. Keywords: +5% per keyword present in the text, capped at +30%.
    keyword_hits = sum(1 for kw in keywords if kw in lower_text)
    keyword_boost = min(0.05 * keyword_hits, 0.30)

    # 2. URLs: flat +10% if at least one http/https URL is present.
    #    Only existence matters here, so a single search is enough.
    url_boost = 0.10 if re.search(r"https?://[^\s]+", lower_text) else 0.0

    # 3. Combined boost, capped at +40%.
    total_boost = min(keyword_boost + url_boost, 0.40)
    if total_boost <= 0:
        # Nothing suspicious found: hand back a copy so callers never share
        # a dict with the input.
        return dict(probabilities)

    # 4. Split the boost equally between the two scam labels.
    half_boost = total_boost / 2.0
    boosted = {
        "SMiShing": probabilities["SMiShing"] + half_boost,
        "Other Scam": probabilities["Other Scam"] + half_boost,
        "Legitimate": probabilities["Legitimate"],
    }

    # 5. Re-normalize so the values sum to 1.
    total = sum(boosted.values())
    if total <= 0:
        # Only reachable with negative input probabilities; fall back to
        # "Legitimate" rather than dividing by a non-positive total.
        return {"SMiShing": 0.0, "Other Scam": 0.0, "Legitimate": 1.0}

    return {label: value / total for label, value in boosted.items()}

def smishing_detector(text, image):
    """
    Classify a message (typed text and/or a screenshot) as SMiShing,
    Other Scam, or Legitimate.

    Steps:
      1. Extract text from the image (OCR) if one is provided.
      2. Combine it with the user-entered text.
      3. Zero-shot classification -> base probabilities.
      4. Keyword + URL boost -> adjusted probabilities.
      5. Report the winning label, its confidence, and the evidence found.

    Args:
        text: the message text entered by the user; may be None or empty.
        image: an image passed to pytesseract for OCR, or None.

    Returns:
        A dict with the keys "text_used_for_classification",
        "original_probabilities", "boosted_probabilities", "label",
        "confidence", "keywords_found" and "urls_found". When there is no
        text to classify, the label is "No text provided", the confidence is
        0.0 and the probability dicts and evidence lists are empty.
    """
    # Steps 1-2: start from the typed text and append OCR text if available.
    combined_text = text or ""
    if image is not None:
        ocr_text = pytesseract.image_to_string(image, lang="spa+eng")
        combined_text += " " + ocr_text

    combined_text = combined_text.strip()
    if not combined_text:
        # Same key set as the normal result so consumers can rely on one shape.
        return {
            "text_used_for_classification": "(none)",
            "original_probabilities": {},
            "boosted_probabilities": {},
            "label": "No text provided",
            "confidence": 0.0,
            "keywords_found": [],
            "urls_found": [],
        }

    # Step 3: zero-shot classification
    result = classifier(
        sequences=combined_text,
        candidate_labels=CANDIDATE_LABELS,
        hypothesis_template="This message is {}."
    )
    original_probs = dict(zip(result["labels"], result["scores"]))

    # Step 4: keyword + URL boost
    boosted_probs = keyword_and_url_boost(original_probs, combined_text)

    # Step 5: pick the final label after the boost
    final_label = max(boosted_probs, key=boosted_probs.get)
    final_confidence = round(boosted_probs[final_label], 3)

    # Evidence: keywords are matched against the lower-cased text, but URLs
    # are extracted from the original text (case-insensitive scheme match) so
    # case-sensitive paths such as short-link codes are reported intact.
    lower_text = combined_text.lower()
    found_keywords = [kw for kw in SCAM_KEYWORDS if kw in lower_text]
    found_urls = re.findall(r"(https?://[^\s]+)", combined_text, flags=re.IGNORECASE)

    return {
        "text_used_for_classification": combined_text,
        "original_probabilities": {k: round(v, 3) for k, v in original_probs.items()},
        "boosted_probabilities": {k: round(v, 3) for k, v in boosted_probs.items()},
        "label": final_label,
        "confidence": final_confidence,
        "keywords_found": found_keywords,
        "urls_found": found_urls,
    }

# --- Gradio front end ---------------------------------------------------------
# Inputs: a free-text box and an optional screenshot (delivered as a PIL image).
_sms_textbox = gr.Textbox(
    label="Paste Suspicious SMS Text (English/Spanish)",
    placeholder="Type or paste the message here...",
    lines=3,
)
_screenshot_upload = gr.Image(
    label="Or Upload a Screenshot (Optional)",
    type="pil",
)

# Text shown under the title in the UI.
_APP_DESCRIPTION = """
This tool classifies messages as SMiShing, Other Scam, or Legitimate using a zero-shot model
(joeddav/xlm-roberta-large-xnli). It also checks for certain "scam keywords" (loaded from a file)
and any URLs, boosting the probability of a scam label if found.
Supports English & Spanish text (OCR included).
"""

# The detector's result dict is rendered as JSON; flagging is turned off.
demo = gr.Interface(
    fn=smishing_detector,
    inputs=[_sms_textbox, _screenshot_upload],
    outputs="json",
    title="SMiShing & Scam Detector (Keyword + URL Boost)",
    description=_APP_DESCRIPTION,
    allow_flagging="never",
)

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()