hackerbyhobby committed on
Commit
a1f36a7
·
unverified ·
1 Parent(s): 9cacd96

more updates

Browse files
Files changed (3) hide show
  1. app.py +74 -7
  2. requirements.txt.good1 +10 -0
  3. scam_keywords.txt +0 -15
app.py CHANGED
@@ -5,6 +5,9 @@ from transformers import pipeline
5
  import re
6
  from langdetect import detect
7
  from deep_translator import GoogleTranslator
 
 
 
8
 
9
  # Translator instance
10
  translator = GoogleTranslator(source="auto", target="es")
@@ -21,6 +24,52 @@ model_name = "joeddav/xlm-roberta-large-xnli"
21
  classifier = pipeline("zero-shot-classification", model=model_name)
22
  CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def get_keywords_by_language(text: str):
25
  """
26
  Detect language using `langdetect` and translate keywords if needed.
@@ -83,9 +132,18 @@ def boost_probabilities(probabilities: dict, text: str):
83
  "SMiShing": p_smishing,
84
  "Other Scam": p_other_scam,
85
  "Legitimate": p_legit,
86
- "detected_lang": detected_lang
87
  }
88
 
 
 
 
 
 
 
 
 
 
89
  def smishing_detector(text, image):
90
  """
91
  Main detection function combining text and OCR.
@@ -102,7 +160,8 @@ def smishing_detector(text, image):
102
  "label": "No text provided",
103
  "confidence": 0.0,
104
  "keywords_found": [],
105
- "urls_found": []
 
106
  }
107
 
108
  result = classifier(
@@ -125,6 +184,14 @@ def smishing_detector(text, image):
125
  found_smishing = [kw for kw in smishing_keys if kw in lower_text]
126
  found_other_scam = [kw for kw in scam_keys if kw in lower_text]
127
 
 
 
 
 
 
 
 
 
128
  return {
129
  "detected_language": detected_lang,
130
  "text_used_for_classification": combined_text,
@@ -135,6 +202,7 @@ def smishing_detector(text, image):
135
  "smishing_keywords_found": found_smishing,
136
  "other_scam_keywords_found": found_other_scam,
137
  "urls_found": found_urls,
 
138
  }
139
 
140
  demo = gr.Interface(
@@ -151,15 +219,14 @@ demo = gr.Interface(
151
  )
152
  ],
153
  outputs="json",
154
- title="SMiShing & Scam Detector (Language Detection + Keyword Translation)",
155
  description="""
156
  This tool classifies messages as SMiShing, Other Scam, or Legitimate using a zero-shot model
157
  (joeddav/xlm-roberta-large-xnli). It automatically detects if the text is Spanish or English.
158
- If Spanish, it translates the English-based keyword lists to Spanish before boosting the scores.
159
- Any URL found further boosts SMiShing specifically.
160
- """,
161
  allow_flagging="never"
162
  )
163
 
164
  if __name__ == "__main__":
165
- demo.launch()
 
5
  import re
6
  from langdetect import detect
7
  from deep_translator import GoogleTranslator
8
+ import shap
9
+ import requests
10
+ import json
11
 
12
  # Translator instance
13
  translator = GoogleTranslator(source="auto", target="es")
 
24
  classifier = pipeline("zero-shot-classification", model=model_name)
25
  CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
26
 
27
+ # SHAP explainer setup
28
+ explainer = shap.Explainer(classifier)
29
+
30
+ # Prompt the user for their Google Safe Browsing API key
31
def get_api_key():
    """Return the Google Safe Browsing API key.

    The key is taken from the ``SAFE_BROWSING_API_KEY`` environment variable
    when that variable holds a non-blank value.  Only when it is missing does
    the function fall back to prompting on stdin, so the application can also
    start in non-interactive environments (hosted demos, containers, CI)
    where ``input()`` would block or fail.

    Returns:
        The API key with surrounding whitespace removed.

    Raises:
        ValueError: If no key is found in the environment and none is
            entered at the prompt (including when stdin is closed).
    """
    # Local import so this function does not depend on a module-level import.
    import os

    api_key = os.environ.get("SAFE_BROWSING_API_KEY", "").strip()
    if not api_key:
        try:
            api_key = input("Please enter your Google Safe Browsing API key: ").strip()
        except EOFError:
            # stdin is closed (non-interactive run): treat as "no key given"
            # so the caller gets the descriptive ValueError below.
            api_key = ""
    if not api_key:
        raise ValueError("API key is required to use the application.")
    return api_key
37
+
38
+ SAFE_BROWSING_API_KEY = get_api_key()
39
+ SAFE_BROWSING_URL = "https://safebrowsing.googleapis.com/v4/threatMatches:find"
40
+
41
def check_url_with_google_safebrowsing(url, timeout=10):
    """Check a URL against Google's Safe Browsing API.

    Sends a single ``threatMatches:find`` lookup for *url* using the
    module-level ``SAFE_BROWSING_URL`` and ``SAFE_BROWSING_API_KEY``.

    Args:
        url: The URL to look up.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        True if the API response contains a non-empty ``"matches"`` entry
        (the URL is flagged), False otherwise.  The lookup is best-effort:
        network errors, HTTP error statuses, and unparsable responses are
        reported on stdout and also yield False.
    """
    if not url:
        # Nothing to look up; avoid a pointless network round-trip.
        return False

    payload = {
        "client": {
            "clientId": "your-client-id",
            "clientVersion": "1.0",
        },
        "threatInfo": {
            "threatTypes": [
                "MALWARE",
                "SOCIAL_ENGINEERING",
                "UNWANTED_SOFTWARE",
                "POTENTIALLY_HARMFUL_APPLICATION",
            ],
            "platformTypes": ["ANY_PLATFORM"],
            "threatEntryTypes": ["URL"],
            "threatEntries": [
                {"url": url},
            ],
        },
    }
    try:
        response = requests.post(
            SAFE_BROWSING_URL,
            params={"key": SAFE_BROWSING_API_KEY},
            json=payload,
            timeout=timeout,
        )
        # Surface 4xx/5xx (bad key, quota exceeded, ...) instead of silently
        # treating an error payload as "no matches".
        response.raise_for_status()
        response_data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"Error checking URL with Safe Browsing API: {e}")
        return False

    # Only a JSON object carrying a non-empty "matches" entry counts as flagged.
    return isinstance(response_data, dict) and bool(response_data.get("matches"))
72
+
73
  def get_keywords_by_language(text: str):
74
  """
75
  Detect language using `langdetect` and translate keywords if needed.
 
132
  "SMiShing": p_smishing,
133
  "Other Scam": p_other_scam,
134
  "Legitimate": p_legit,
135
+ "detected_lang": detected_lang,
136
  }
137
 
138
def explain_classification(text):
    """Compute SHAP values for *text* and build a force plot from them.

    Uses the module-level ``explainer``.  The function returns ``None``.
    """
    # The explainer is called with a one-element batch holding the message.
    explanations = explainer([text])

    base_value = explainer.expected_value[0]
    contributions = explanations[0].values[0]
    features = explanations[0].data

    # NOTE(review): the object returned by force_plot is neither stored nor
    # returned here, so callers cannot display it -- confirm this is intended.
    shap.force_plot(base_value, contributions, features)
146
+
147
  def smishing_detector(text, image):
148
  """
149
  Main detection function combining text and OCR.
 
160
  "label": "No text provided",
161
  "confidence": 0.0,
162
  "keywords_found": [],
163
+ "urls_found": [],
164
+ "threat_analysis": "No URLs to analyze",
165
  }
166
 
167
  result = classifier(
 
184
  found_smishing = [kw for kw in smishing_keys if kw in lower_text]
185
  found_other_scam = [kw for kw in scam_keys if kw in lower_text]
186
 
187
+ # Analyze URLs using Google's Safe Browsing API
188
+ threat_analysis = {
189
+ url: check_url_with_google_safebrowsing(url) for url in found_urls
190
+ }
191
+
192
+ # SHAP Explanation (optional for user insights)
193
+ explain_classification(combined_text)
194
+
195
  return {
196
  "detected_language": detected_lang,
197
  "text_used_for_classification": combined_text,
 
202
  "smishing_keywords_found": found_smishing,
203
  "other_scam_keywords_found": found_other_scam,
204
  "urls_found": found_urls,
205
+ "threat_analysis": threat_analysis,
206
  }
207
 
208
  demo = gr.Interface(
 
219
  )
220
  ],
221
  outputs="json",
222
+ title="SMiShing & Scam Detector with Safe Browsing",
223
  description="""
224
  This tool classifies messages as SMiShing, Other Scam, or Legitimate using a zero-shot model
225
  (joeddav/xlm-roberta-large-xnli). It automatically detects if the text is Spanish or English.
226
+ It uses SHAP for explainability and checks URLs against Google's Safe Browsing API for enhanced analysis.
227
+ """,
 
228
  allow_flagging="never"
229
  )
230
 
231
  if __name__ == "__main__":
232
+ demo.launch()
requirements.txt.good1 ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==3.36.0
2
+ transformers==4.35.0
3
+ torch==2.0.1
4
+ pillow==9.5.0
5
+ pytesseract==0.3.10
6
+ langdetect==1.0.9
7
+ deep-translator==1.10.1
8
+ httpx==0.13.3
9
+ sentencepiece==0.1.99
10
+ numpy==1.25.0
scam_keywords.txt DELETED
@@ -1,15 +0,0 @@
1
- ceo
2
- cash
3
- claim
4
- gift
5
- urgent
6
- prize
7
- password
8
- bank
9
- lottery
10
- loan
11
- winner
12
- congratulations
13
- credit
14
- account
15
- verify