hackerbyhobby committed on
Commit
04508b8
·
unverified ·
1 Parent(s): 1eb6c81

rollback due to error

Browse files
Files changed (1) hide show
  1. app.py +77 -202
app.py CHANGED
@@ -5,22 +5,6 @@ from transformers import pipeline
5
  import re
6
  from langdetect import detect
7
  from deep_translator import GoogleTranslator
8
- import shap
9
- import requests
10
- import json
11
- import os
12
- import numpy as np
13
- from shap.maskers import Text
14
-
15
- # Patch SHAP to replace np.bool with np.bool_ dynamically
16
- if hasattr(shap.maskers._text.Text, "invariants"):
17
- original_invariants = shap.maskers._text.Text.invariants
18
-
19
- def patched_invariants(self, *args):
20
- # Use np.bool_ instead of the deprecated np.bool
21
- return np.zeros(len(self._tokenized_s), dtype=np.bool_)
22
-
23
- shap.maskers._text.Text.invariants = patched_invariants
24
 
25
  # Translator instance
26
  translator = GoogleTranslator(source="auto", target="es")
@@ -37,58 +21,15 @@ model_name = "joeddav/xlm-roberta-large-xnli"
37
  classifier = pipeline("zero-shot-classification", model=model_name)
38
  CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
39
 
40
- # 3. SHAP Explainer Setup
41
- explainer = shap.Explainer(classifier, masker=Text(tokenizer=classifier.tokenizer))
42
-
43
- # Retrieve the Google Safe Browsing API key from the environment
44
- SAFE_BROWSING_API_KEY = os.getenv("SAFE_BROWSING_API_KEY")
45
-
46
- if not SAFE_BROWSING_API_KEY:
47
- raise ValueError("Google Safe Browsing API key not found. Please set it as an environment variable in your Hugging Face Space.")
48
-
49
- SAFE_BROWSING_URL = "https://safebrowsing.googleapis.com/v4/threatMatches:find"
50
-
51
- def check_url_with_google_safebrowsing(url):
52
- """
53
- Check a URL against Google's Safe Browsing API.
54
- """
55
- payload = {
56
- "client": {
57
- "clientId": "your-client-id",
58
- "clientVersion": "1.0"
59
- },
60
- "threatInfo": {
61
- "threatTypes": ["MALWARE", "SOCIAL_ENGINEERING", "UNWANTED_SOFTWARE", "POTENTIALLY_HARMFUL_APPLICATION"],
62
- "platformTypes": ["ANY_PLATFORM"],
63
- "threatEntryTypes": ["URL"],
64
- "threatEntries": [
65
- {"url": url}
66
- ]
67
- }
68
- }
69
- try:
70
- response = requests.post(
71
- SAFE_BROWSING_URL,
72
- params={"key": SAFE_BROWSING_API_KEY},
73
- json=payload
74
- )
75
- response_data = response.json()
76
- if "matches" in response_data:
77
- return True # URL is flagged as malicious
78
- return False # URL is safe
79
- except Exception as e:
80
- print(f"Error checking URL with Safe Browsing API: {e}")
81
- return False
82
-
83
  def get_keywords_by_language(text: str):
84
  """
85
  Detect language using `langdetect` and translate keywords if needed.
86
  """
87
- snippet = text[:200] # Use a snippet for detection
88
  try:
89
  detected_lang = detect(snippet)
90
  except Exception:
91
- detected_lang = "en" # Default to English if detection fails
92
 
93
  if detected_lang == "es":
94
  smishing_in_spanish = [
@@ -126,10 +67,12 @@ def boost_probabilities(probabilities: dict, text: str):
126
  p_other_scam += other_scam_boost
127
  p_legit -= (smishing_boost + other_scam_boost)
128
 
 
129
  p_smishing = max(p_smishing, 0.0)
130
  p_other_scam = max(p_other_scam, 0.0)
131
  p_legit = max(p_legit, 0.0)
132
 
 
133
  total = p_smishing + p_other_scam + p_legit
134
  if total > 0:
135
  p_smishing /= total
@@ -142,104 +85,19 @@ def boost_probabilities(probabilities: dict, text: str):
142
  "SMiShing": p_smishing,
143
  "Other Scam": p_other_scam,
144
  "Legitimate": p_legit,
145
- "detected_lang": detected_lang,
146
  }
147
 
148
- def explain_classification(text):
149
- """
150
- Generate SHAP explanations for the classification.
151
  """
152
- if not text.strip():
153
- raise ValueError("Cannot generate SHAP explanations for empty text.")
154
-
155
- shap_values = explainer([text])
156
- shap.force_plot(
157
- explainer.expected_value[0], shap_values[0].values[0], shap_values[0].data
158
- )
159
-
160
- def generate_user_friendly_message(
161
- final_label: str,
162
- confidence: float,
163
- found_smishing: list,
164
- found_other_scam: list,
165
- found_urls: list,
166
- threat_analysis: dict
167
- ) -> str:
168
  """
169
- Build a user-friendly explanation of the classification and provide
170
- a brief reason why it is labeled as SMiShing, Other Scam, or Legitimate.
171
- """
172
- if final_label == "SMiShing":
173
- msg = (
174
- f"This message is classified as SMiShing (confidence {confidence}). "
175
- "We found indications typical of phishing via SMS, such as "
176
- )
177
- reasons = []
178
- if found_smishing:
179
- reasons.append(f"the use of suspicious keywords: {', '.join(found_smishing)}")
180
- if found_urls:
181
- flagged_urls = [u for u in found_urls if threat_analysis.get(u)]
182
- safe_urls = [u for u in found_urls if not threat_analysis.get(u)]
183
- if flagged_urls:
184
- reasons.append(f"at least one URL flagged as unsafe: {', '.join(flagged_urls)}")
185
- if safe_urls:
186
- reasons.append(f"other URLs may be suspicious: {', '.join(safe_urls)}")
187
-
188
- if not reasons:
189
- reasons.append("certain context or structure commonly used in SMiShing")
190
-
191
- msg += " and ".join(reasons) + "."
192
- return msg
193
-
194
- elif final_label == "Other Scam":
195
- msg = (
196
- f"This message is classified as 'Other Scam' (confidence {confidence}). "
197
- "It contains elements typically associated with scams. "
198
- )
199
- reasons = []
200
- if found_other_scam:
201
- reasons.append(f"keywords often linked to fraudulent activity: {', '.join(found_other_scam)}")
202
- if found_urls:
203
- flagged_urls = [u for u in found_urls if threat_analysis.get(u)]
204
- safe_urls = [u for u in found_urls if not threat_analysis.get(u)]
205
- if flagged_urls:
206
- reasons.append(f"URLs flagged as unsafe: {', '.join(flagged_urls)}")
207
- if safe_urls:
208
- reasons.append(f"additional suspicious URLs: {', '.join(safe_urls)}")
209
-
210
- if not reasons:
211
- reasons.append("general content or structure known to be used in scams")
212
-
213
- msg += " and ".join(reasons) + "."
214
- return msg
215
-
216
- else: # Legitimate
217
- msg = (
218
- f"This message is classified as 'Legitimate' (confidence {confidence}). "
219
- "We did not detect typical phishing or scam indicators. "
220
- )
221
- if found_urls:
222
- # If there are URLs, mention if they're considered safe
223
- flagged_urls = [u for u in found_urls if threat_analysis.get(u)]
224
- if flagged_urls:
225
- msg += f"However, note that at least one URL appears unsafe: {', '.join(flagged_urls)}."
226
- else:
227
- msg += "Although it contains URLs, none appear to be malicious."
228
- else:
229
- msg += "No suspicious keywords or URLs were detected."
230
-
231
- return msg
232
-
233
-
234
- def smishing_detector(text, image):
235
- """
236
- Main detection function combining text and OCR.
237
- """
238
- combined_text = text or ""
239
- if image is not None:
240
- ocr_text = pytesseract.image_to_string(image, lang="spa+eng")
241
- combined_text += " " + ocr_text
242
- combined_text = combined_text.strip()
243
 
244
  if not combined_text:
245
  return {
@@ -247,9 +105,7 @@ def smishing_detector(text, image):
247
  "label": "No text provided",
248
  "confidence": 0.0,
249
  "keywords_found": [],
250
- "urls_found": [],
251
- "threat_analysis": "No URLs to analyze",
252
- "user_friendly_message": "No classification could be made since no text was provided.",
253
  }
254
 
255
  result = classifier(
@@ -258,12 +114,18 @@ def smishing_detector(text, image):
258
  hypothesis_template="This message is {}."
259
  )
260
  original_probs = {k: float(v) for k, v in zip(result["labels"], result["scores"])}
 
261
  boosted = boost_probabilities(original_probs, combined_text)
262
 
263
- # Extract language key first, then remove
264
- detected_lang = boosted.pop("detected_lang", "en")
 
 
 
 
265
  for k, v in boosted.items():
266
  boosted[k] = float(v)
 
267
 
268
  final_label = max(boosted, key=boosted.get)
269
  final_confidence = round(boosted[final_label], 3)
@@ -275,24 +137,6 @@ def smishing_detector(text, image):
275
  found_smishing = [kw for kw in smishing_keys if kw in lower_text]
276
  found_other_scam = [kw for kw in scam_keys if kw in lower_text]
277
 
278
- # Analyze URLs using Google's Safe Browsing API
279
- threat_analysis = {
280
- url: check_url_with_google_safebrowsing(url) for url in found_urls
281
- }
282
-
283
- # Generate SHAP Explanation (optional for user insights)
284
- explain_classification(combined_text)
285
-
286
- # Build user-friendly message
287
- user_friendly_msg = generate_user_friendly_message(
288
- final_label,
289
- final_confidence,
290
- found_smishing,
291
- found_other_scam,
292
- found_urls,
293
- threat_analysis
294
- )
295
-
296
  return {
297
  "detected_language": detected_lang,
298
  "text_used_for_classification": combined_text,
@@ -303,33 +147,64 @@ def smishing_detector(text, image):
303
  "smishing_keywords_found": found_smishing,
304
  "other_scam_keywords_found": found_other_scam,
305
  "urls_found": found_urls,
306
- "threat_analysis": threat_analysis,
307
- # The new user-friendly explanation
308
- "user_friendly_message": user_friendly_msg,
309
  }
310
 
311
- demo = gr.Interface(
312
- fn=smishing_detector,
313
- inputs=[
314
- gr.Textbox(
315
- lines=3,
316
- label="Paste Suspicious SMS Text (English/Spanish)",
317
- placeholder="Type or paste the message here..."
318
- ),
319
- gr.Image(
320
- type="pil",
321
- label="Or Upload a Screenshot (Optional)"
 
 
 
 
 
 
 
 
 
 
 
 
322
  )
323
- ],
324
- outputs="json",
325
- title="SMiShing & Scam Detector with Safe Browsing",
326
- description="""
327
- This tool classifies messages as SMiShing, Other Scam, or Legitimate using a zero-shot model
328
- (joeddav/xlm-roberta-large-xnli). It automatically detects if the text is Spanish or English.
329
- It uses SHAP for explainability and checks URLs against Google's Safe Browsing API for enhanced analysis.
330
- """,
331
- flagging_mode="never"
332
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
 
334
  if __name__ == "__main__":
335
  demo.launch()
 
5
  import re
6
  from langdetect import detect
7
  from deep_translator import GoogleTranslator
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  # Translator instance
10
  translator = GoogleTranslator(source="auto", target="es")
 
21
  classifier = pipeline("zero-shot-classification", model=model_name)
22
  CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def get_keywords_by_language(text: str):
25
  """
26
  Detect language using `langdetect` and translate keywords if needed.
27
  """
28
+ snippet = text[:200]
29
  try:
30
  detected_lang = detect(snippet)
31
  except Exception:
32
+ detected_lang = "en"
33
 
34
  if detected_lang == "es":
35
  smishing_in_spanish = [
 
67
  p_other_scam += other_scam_boost
68
  p_legit -= (smishing_boost + other_scam_boost)
69
 
70
+ # Clamp
71
  p_smishing = max(p_smishing, 0.0)
72
  p_other_scam = max(p_other_scam, 0.0)
73
  p_legit = max(p_legit, 0.0)
74
 
75
+ # Re-normalize
76
  total = p_smishing + p_other_scam + p_legit
77
  if total > 0:
78
  p_smishing /= total
 
85
  "SMiShing": p_smishing,
86
  "Other Scam": p_other_scam,
87
  "Legitimate": p_legit,
88
+ "detected_lang": detected_lang
89
  }
90
 
91
+ def smishing_detector(input_type, text, image):
 
 
92
  """
93
+ Main detection function combining text (if 'Text') and OCR (if 'Screenshot').
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  """
95
+ if input_type == "Text":
96
+ combined_text = text.strip() if text else ""
97
+ else:
98
+ combined_text = ""
99
+ if image is not None:
100
+ combined_text = pytesseract.image_to_string(image, lang="spa+eng").strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  if not combined_text:
103
  return {
 
105
  "label": "No text provided",
106
  "confidence": 0.0,
107
  "keywords_found": [],
108
+ "urls_found": []
 
 
109
  }
110
 
111
  result = classifier(
 
114
  hypothesis_template="This message is {}."
115
  )
116
  original_probs = {k: float(v) for k, v in zip(result["labels"], result["scores"])}
117
+
118
  boosted = boost_probabilities(original_probs, combined_text)
119
 
120
+ # Patched snippet begins
121
+ # 1. Extract language first, preserving it
122
+ detected_lang = boosted.get("detected_lang", "en")
123
+ # 2. Remove it so only numeric keys remain
124
+ boosted.pop("detected_lang", None)
125
+ # 3. Convert numeric values to float
126
  for k, v in boosted.items():
127
  boosted[k] = float(v)
128
+ # Patched snippet ends
129
 
130
  final_label = max(boosted, key=boosted.get)
131
  final_confidence = round(boosted[final_label], 3)
 
137
  found_smishing = [kw for kw in smishing_keys if kw in lower_text]
138
  found_other_scam = [kw for kw in scam_keys if kw in lower_text]
139
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  return {
141
  "detected_language": detected_lang,
142
  "text_used_for_classification": combined_text,
 
147
  "smishing_keywords_found": found_smishing,
148
  "other_scam_keywords_found": found_other_scam,
149
  "urls_found": found_urls,
 
 
 
150
  }
151
 
152
+ #
153
+ # Gradio interface with dynamic visibility
154
+ #
155
+ def toggle_inputs(choice):
156
+ """
157
+ Return updates for (text_input, image_input) based on the radio selection.
158
+ """
159
+ if choice == "Text":
160
+ # Show text input, hide image
161
+ return gr.update(visible=True), gr.update(visible=False)
162
+ else:
163
+ # choice == "Screenshot"
164
+ # Hide text input, show image
165
+ return gr.update(visible=False), gr.update(visible=True)
166
+
167
+ with gr.Blocks() as demo:
168
+ gr.Markdown("## SMiShing & Scam Detector (Choose Text or Screenshot)")
169
+
170
+ with gr.Row():
171
+ input_type = gr.Radio(
172
+ choices=["Text", "Screenshot"],
173
+ value="Text",
174
+ label="Choose Input Type"
175
  )
176
+
177
+ text_input = gr.Textbox(
178
+ lines=3,
179
+ label="Paste Suspicious SMS Text",
180
+ placeholder="Type or paste the message here...",
181
+ visible=True # default
182
+ )
183
+
184
+ image_input = gr.Image(
185
+ type="pil",
186
+ label="Upload Screenshot",
187
+ visible=False # hidden by default
188
+ )
189
+
190
+ # Whenever input_type changes, toggle which input is visible
191
+ input_type.change(
192
+ fn=toggle_inputs,
193
+ inputs=input_type,
194
+ outputs=[text_input, image_input],
195
+ queue=False
196
+ )
197
+
198
+ # Button to run classification
199
+ analyze_btn = gr.Button("Classify")
200
+ output_json = gr.JSON(label="Result")
201
+
202
+ # On button click, call the smishing_detector
203
+ analyze_btn.click(
204
+ fn=smishing_detector,
205
+ inputs=[input_type, text_input, image_input],
206
+ outputs=output_json
207
+ )
208
 
209
  if __name__ == "__main__":
210
  demo.launch()