hackerbyhobby committed on
Commit
8edfc45
·
unverified ·
1 Parent(s): 90d7edb

added initial app

Browse files
Files changed (1) hide show
  1. app.py +150 -4
app.py CHANGED
@@ -1,8 +1,154 @@
1
  import gradio as gr
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import pytesseract
3
+ from PIL import Image
4
+ from transformers import pipeline
5
+ import re
6
 
7
# 1. Scam keyword list.
# 'scam_keywords.txt' holds one keyword per line; every entry is trimmed and
# lower-cased, and blank lines are dropped.
with open("scam_keywords.txt", "r", encoding="utf-8") as f:
    SCAM_KEYWORDS = [kw for kw in (raw.strip().lower() for raw in f) if kw]

# 2. Zero-shot classification pipeline (built once at import time).
model_name = "joeddav/xlm-roberta-large-xnli"
classifier = pipeline(task="zero-shot-classification", model=model_name)

# Labels the zero-shot classifier chooses between.
CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
17
+
18
def keyword_and_url_boost(
    probabilities,
    text,
    *,
    keywords=None,
    keyword_step=0.05,
    keyword_cap=0.30,
    url_bonus=0.10,
    total_cap=0.40,
):
    """
    Adjust final probabilities if certain scam-related keywords or URLs appear.

    - probabilities: dict, label -> original probability. Must contain the
      keys "SMiShing", "Other Scam" and "Legitimate".
    - text: the combined text from user input + OCR.
    - keywords: optional iterable of keywords to look for (matched as
      case-insensitive substrings). Defaults to the module-level SCAM_KEYWORDS.
    - keyword_step: boost added per keyword found in the text.
    - keyword_cap: upper limit on the combined keyword boost.
    - url_bonus: flat boost added when at least one http/https URL is present.
    - total_cap: upper limit on keyword boost + URL boost.

    Returns a new dict. When nothing matched it is an unchanged copy of
    `probabilities`; otherwise it holds the three labels, with the boost split
    equally between "SMiShing" and "Other Scam" and the result re-normalized
    to sum to 1.
    """
    if keywords is None:
        # Resolved at call time so the module-level list stays the default.
        keywords = SCAM_KEYWORDS

    lower_text = text.lower()

    # 1. Keyword boost: a fixed step per keyword present, capped.
    #    Empty keywords are skipped because "" is a substring of every text.
    keyword_count = sum(1 for kw in keywords if kw and kw.lower() in lower_text)
    keyword_boost = min(keyword_step * keyword_count, keyword_cap)

    # 2. URL boost: flat bonus if any http/https URL is present.
    has_url = re.search(r"https?://[^\s]+", lower_text) is not None
    url_boost = url_bonus if has_url else 0.0

    # 3. Combine and cap the total boost.
    total_boost = min(keyword_boost + url_boost, total_cap)
    if total_boost <= 0:
        # Nothing matched: hand back a copy so callers never share (and
        # accidentally mutate) the input mapping.
        return dict(probabilities)

    # 4. Distribute the total boost equally to "SMiShing" and "Other Scam".
    half_boost = total_boost / 2.0
    smishing_boosted = probabilities["SMiShing"] + half_boost
    other_scam_boosted = probabilities["Other Scam"] + half_boost
    legit_boosted = probabilities["Legitimate"]

    # 5. Re-normalize so they sum to 1.
    total = smishing_boosted + other_scam_boosted + legit_boosted
    if total <= 0:
        # Defensive fallback for degenerate (e.g. negative) inputs.
        return {"SMiShing": 0.0, "Other Scam": 0.0, "Legitimate": 1.0}

    return {
        "SMiShing": smishing_boosted / total,
        "Other Scam": other_scam_boosted / total,
        "Legitimate": legit_boosted / total,
    }
73
+
74
def smishing_detector(text, image):
    """
    Classify a message as "SMiShing", "Other Scam" or "Legitimate".

    Steps:
    1. Extract text from the image (OCR) if one is provided.
    2. Combine it with the user-entered text.
    3. Zero-shot classification -> base probabilities.
    4. Keyword + URL boost -> adjusted probabilities.
    5. Report the winning label, its confidence, and the evidence found.

    - text: message text typed by the user (may be None or empty).
    - image: image passed to pytesseract for OCR, or None.

    Returns a dict with the keys "text_used_for_classification",
    "original_probabilities", "boosted_probabilities", "label", "confidence",
    "keywords_found" and "urls_found". When there is no text at all, the
    probability dicts are empty, the label is "No text provided" and the
    confidence is 0.0.
    """
    # Step 1: OCR if there's an image
    combined_text = text if text else ""
    if image is not None:
        ocr_text = pytesseract.image_to_string(image, lang="spa+eng")
        combined_text += " " + ocr_text

    # Clean text
    combined_text = combined_text.strip()
    if not combined_text:
        # Same key set as the normal result so consumers see one JSON shape.
        return {
            "text_used_for_classification": "(none)",
            "original_probabilities": {},
            "boosted_probabilities": {},
            "label": "No text provided",
            "confidence": 0.0,
            "keywords_found": [],
            "urls_found": [],
        }

    # Step 2: Zero-shot classification
    result = classifier(
        sequences=combined_text,
        candidate_labels=CANDIDATE_LABELS,
        hypothesis_template="This message is {}."
    )
    original_probs = dict(zip(result["labels"], result["scores"]))

    # Step 3: Keyword + URL boost
    boosted_probs = keyword_and_url_boost(original_probs, combined_text)

    # Step 4: Pick final label after boost
    final_label = max(boosted_probs, key=boosted_probs.get)
    final_confidence = round(boosted_probs[final_label], 3)

    # Step 5: Identify which keywords and URLs were found
    lower_text = combined_text.lower()
    found_keywords = [kw for kw in SCAM_KEYWORDS if kw in lower_text]
    # Match on the original text (case-insensitively) so reported URLs keep
    # their exact casing; lower-casing would corrupt case-sensitive links.
    found_urls = re.findall(r"(https?://[^\s]+)", combined_text, flags=re.IGNORECASE)

    return {
        "text_used_for_classification": combined_text,
        "original_probabilities": {k: round(v, 3) for k, v in original_probs.items()},
        "boosted_probabilities": {k: round(v, 3) for k, v in boosted_probs.items()},
        "label": final_label,
        "confidence": final_confidence,
        "keywords_found": found_keywords,
        "urls_found": found_urls,
    }
128
+
129
# Input widgets: free-text message box plus an optional screenshot for OCR.
message_input = gr.Textbox(
    lines=3,
    label="Paste Suspicious SMS Text (English/Spanish)",
    placeholder="Type or paste the message here..."
)
screenshot_input = gr.Image(
    type="pil",
    label="Or Upload a Screenshot (Optional)"
)

# Gradio app: feeds both inputs to smishing_detector and shows its dict as JSON.
demo = gr.Interface(
    fn=smishing_detector,
    inputs=[message_input, screenshot_input],
    outputs="json",
    title="SMiShing & Scam Detector (Keyword + URL Boost)",
    description="""
This tool classifies messages as SMiShing, Other Scam, or Legitimate using a zero-shot model
(joeddav/xlm-roberta-large-xnli). It also checks for certain "scam keywords" (loaded from a file)
and any URLs, boosting the probability of a scam label if found.
Supports English & Spanish text (OCR included).
""",
    allow_flagging="never"
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()