hackerbyhobby committed on
Commit
d2285c1
·
unverified ·
1 Parent(s): a63ad64

updated requirements

Browse files
Files changed (1) hide show
  1. app.py +18 -69
app.py CHANGED
@@ -3,12 +3,11 @@ import pytesseract
3
  from PIL import Image
4
  from transformers import pipeline
5
  import re
6
-
7
- # Language detection & translation
8
  from langdetect import detect
9
- from googletrans import Translator
10
 
11
- translator = Translator()
 
12
 
13
  # 1. Load separate keywords for SMiShing and Other Scam (assumed in English)
14
  with open("smishing_keywords.txt", "r", encoding="utf-8") as f:
@@ -24,26 +23,23 @@ CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
24
 
25
  def get_keywords_by_language(text: str):
26
  """
27
- 1. Detect language (using `langdetect`).
28
- 2. If Spanish ('es'), translate each English-based keyword to Spanish using googletrans.
29
- 3. If English (or anything else), just use the original English lists.
30
  """
31
- # Attempt to detect language from a snippet (to reduce overhead on very large text)
32
- snippet = text[:200] # up to 200 chars for detection
33
  try:
34
  detected_lang = detect(snippet)
35
- except:
36
- detected_lang = "en" # fallback if detection fails
37
 
38
  if detected_lang == "es":
39
  # Translate all SMiShing and Other Scam keywords to Spanish
40
  smishing_in_spanish = [
41
- translator.translate(kw, src="en", dest="es").text.lower()
42
- for kw in SMISHING_KEYWORDS
43
  ]
44
  other_scam_in_spanish = [
45
- translator.translate(kw, src="en", dest="es").text.lower()
46
- for kw in OTHER_SCAM_KEYWORDS
47
  ]
48
  return smishing_in_spanish, other_scam_in_spanish, "es"
49
  else:
@@ -52,45 +48,29 @@ def get_keywords_by_language(text: str):
52
 
53
  def boost_probabilities(probabilities: dict, text: str):
54
  """
55
- 1. Load the appropriate keyword lists (English or Spanish).
56
- 2. Count matches for SMiShing vs. Other Scam.
57
- 3. If a URL is found, add an extra boost only to SMiShing.
58
- 4. Subtract total boost from 'Legitimate'.
59
- 5. Clamp negative probabilities to 0, re-normalize.
60
  """
61
  lower_text = text.lower()
62
-
63
- # Grab the correct keyword lists based on language
64
  smishing_keywords, other_scam_keywords, detected_lang = get_keywords_by_language(text)
65
 
66
- # Count SMiShing keyword matches
67
  smishing_count = sum(1 for kw in smishing_keywords if kw in lower_text)
68
- # Count Other Scam keyword matches
69
  other_scam_count = sum(1 for kw in other_scam_keywords if kw in lower_text)
70
 
71
- # Base boost amounts
72
  smishing_boost = 0.30 * smishing_count
73
  other_scam_boost = 0.30 * other_scam_count
74
 
75
- # Check for URLs => +0.35 only to SMiShing
76
  found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
77
  if found_urls:
78
  smishing_boost += 0.35
79
 
80
- # Extract original probabilities
81
  p_smishing = probabilities["SMiShing"]
82
  p_other_scam = probabilities["Other Scam"]
83
  p_legit = probabilities["Legitimate"]
84
 
85
- # Apply boosts
86
  p_smishing += smishing_boost
87
  p_other_scam += other_scam_boost
 
88
 
89
- # Subtract total boost from 'Legitimate'
90
- total_boost = smishing_boost + other_scam_boost
91
- p_legit -= total_boost
92
-
93
- # Clamp negative probabilities
94
  if p_smishing < 0:
95
  p_smishing = 0.0
96
  if p_other_scam < 0:
@@ -98,14 +78,12 @@ def boost_probabilities(probabilities: dict, text: str):
98
  if p_legit < 0:
99
  p_legit = 0.0
100
 
101
- # Re-normalize
102
  total = p_smishing + p_other_scam + p_legit
103
  if total > 0:
104
  p_smishing /= total
105
  p_other_scam /= total
106
  p_legit /= total
107
  else:
108
- # fallback if everything is 0
109
  p_smishing, p_other_scam, p_legit = 0.0, 0.0, 1.0
110
 
111
  return {
@@ -116,13 +94,6 @@ def boost_probabilities(probabilities: dict, text: str):
116
  }
117
 
118
  def smishing_detector(text, image):
119
- """
120
- Main function called by Gradio.
121
- 1. Combine user text + OCR text (if an image is provided).
122
- 2. Zero-shot classify => base probabilities.
123
- 3. Apply language detection & translation if needed, then boost logic.
124
- 4. Return final classification.
125
- """
126
  combined_text = text or ""
127
  if image is not None:
128
  ocr_text = pytesseract.image_to_string(image, lang="spa+eng")
@@ -138,51 +109,29 @@ def smishing_detector(text, image):
138
  "urls_found": []
139
  }
140
 
141
- # 1. Zero-shot classification
142
  result = classifier(
143
  sequences=combined_text,
144
  candidate_labels=CANDIDATE_LABELS,
145
  hypothesis_template="This message is {}."
146
  )
147
  original_probs = dict(zip(result["labels"], result["scores"]))
148
-
149
- # 2. Boost logic (including language detection + translation)
150
  boosted = boost_probabilities(original_probs, combined_text)
151
- final_label = max(boosted, key=boosted.get) if not isinstance(boosted.get("detected_lang"), float) else "Legitimate"
152
- # to avoid conflict, let's store the detected language separately:
153
- detected_lang = boosted.pop("detected_lang", "en")
154
-
155
- # We have p_smishing, p_other_scam, p_legit left in boosted
156
  final_label = max(boosted, key=boosted.get)
157
  final_confidence = round(boosted[final_label], 3)
 
158
 
159
- # 3. Identify which keywords & URLs we found
160
  lower_text = combined_text.lower()
161
- # If we detected Spanish, we used the translated keywords to do matching. But let's also show them:
162
- # For demonstration, let's just show the "English or Spanish" keywords. The code to show them in output
163
- # can be the same as before, or you can do a second pass with the same logic from boost_probabilities.
164
- found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
165
-
166
- # We'll do a quick second pass on actual matched keywords so user sees them
167
- # - If language is es => we used translated Spanish keywords, let's do the same for display
168
- # - If language is en => we used the original English lists
169
- if detected_lang == "es":
170
- smishing_keys, scam_keys, _ = get_keywords_by_language(combined_text)
171
- else:
172
- smishing_keys, scam_keys, _ = (SMISHING_KEYWORDS, OTHER_SCAM_KEYWORDS, "en")
173
 
 
174
  found_smishing = [kw for kw in smishing_keys if kw in lower_text]
175
  found_other_scam = [kw for kw in scam_keys if kw in lower_text]
176
 
177
  return {
178
  "detected_language": detected_lang,
179
  "text_used_for_classification": combined_text,
180
- "original_probabilities": {
181
- k: round(v, 3) for k, v in original_probs.items()
182
- },
183
- "boosted_probabilities": {
184
- k: round(v, 3) for k, v in boosted.items()
185
- },
186
  "label": final_label,
187
  "confidence": final_confidence,
188
  "smishing_keywords_found": found_smishing,
 
3
  from PIL import Image
4
  from transformers import pipeline
5
  import re
 
 
6
  from langdetect import detect
7
+ from deep_translator import GoogleTranslator
8
 
9
+ # Translator instance
10
+ translator = GoogleTranslator(source="auto", target="es")
11
 
12
  # 1. Load separate keywords for SMiShing and Other Scam (assumed in English)
13
  with open("smishing_keywords.txt", "r", encoding="utf-8") as f:
 
23
 
24
  def get_keywords_by_language(text: str):
25
  """
26
+ 1. Detect language using `langdetect`.
27
+ 2. If Spanish ('es'), translate each English-based keyword to Spanish using `deep-translator`.
28
+ 3. If English (or other languages), use the original English lists.
29
  """
30
+ snippet = text[:200] # Use a snippet for detection
 
31
  try:
32
  detected_lang = detect(snippet)
33
+ except Exception:
34
+ detected_lang = "en" # Default to English if detection fails
35
 
36
  if detected_lang == "es":
37
  # Translate all SMiShing and Other Scam keywords to Spanish
38
  smishing_in_spanish = [
39
+ translator.translate(kw).lower() for kw in SMISHING_KEYWORDS
 
40
  ]
41
  other_scam_in_spanish = [
42
+ translator.translate(kw).lower() for kw in OTHER_SCAM_KEYWORDS
 
43
  ]
44
  return smishing_in_spanish, other_scam_in_spanish, "es"
45
  else:
 
48
 
49
  def boost_probabilities(probabilities: dict, text: str):
50
  """
51
+ Boost probabilities based on keyword matches and presence of URLs.
 
 
 
 
52
  """
53
  lower_text = text.lower()
 
 
54
  smishing_keywords, other_scam_keywords, detected_lang = get_keywords_by_language(text)
55
 
 
56
  smishing_count = sum(1 for kw in smishing_keywords if kw in lower_text)
 
57
  other_scam_count = sum(1 for kw in other_scam_keywords if kw in lower_text)
58
 
 
59
  smishing_boost = 0.30 * smishing_count
60
  other_scam_boost = 0.30 * other_scam_count
61
 
 
62
  found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
63
  if found_urls:
64
  smishing_boost += 0.35
65
 
 
66
  p_smishing = probabilities["SMiShing"]
67
  p_other_scam = probabilities["Other Scam"]
68
  p_legit = probabilities["Legitimate"]
69
 
 
70
  p_smishing += smishing_boost
71
  p_other_scam += other_scam_boost
72
+ p_legit -= (smishing_boost + other_scam_boost)
73
 
 
 
 
 
 
74
  if p_smishing < 0:
75
  p_smishing = 0.0
76
  if p_other_scam < 0:
 
78
  if p_legit < 0:
79
  p_legit = 0.0
80
 
 
81
  total = p_smishing + p_other_scam + p_legit
82
  if total > 0:
83
  p_smishing /= total
84
  p_other_scam /= total
85
  p_legit /= total
86
  else:
 
87
  p_smishing, p_other_scam, p_legit = 0.0, 0.0, 1.0
88
 
89
  return {
 
94
  }
95
 
96
  def smishing_detector(text, image):
 
 
 
 
 
 
 
97
  combined_text = text or ""
98
  if image is not None:
99
  ocr_text = pytesseract.image_to_string(image, lang="spa+eng")
 
109
  "urls_found": []
110
  }
111
 
 
112
  result = classifier(
113
  sequences=combined_text,
114
  candidate_labels=CANDIDATE_LABELS,
115
  hypothesis_template="This message is {}."
116
  )
117
  original_probs = dict(zip(result["labels"], result["scores"]))
 
 
118
  boosted = boost_probabilities(original_probs, combined_text)
 
 
 
 
 
119
  final_label = max(boosted, key=boosted.get)
120
  final_confidence = round(boosted[final_label], 3)
121
+ detected_lang = boosted.pop("detected_lang", "en")
122
 
 
123
  lower_text = combined_text.lower()
124
+ smishing_keys, scam_keys, _ = get_keywords_by_language(combined_text)
 
 
 
 
 
 
 
 
 
 
 
125
 
126
+ found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
127
  found_smishing = [kw for kw in smishing_keys if kw in lower_text]
128
  found_other_scam = [kw for kw in scam_keys if kw in lower_text]
129
 
130
  return {
131
  "detected_language": detected_lang,
132
  "text_used_for_classification": combined_text,
133
+ "original_probabilities": {k: round(v, 3) for k, v in original_probs.items()},
134
+ "boosted_probabilities": {k: round(v, 3) for k, v in boosted.items()},
 
 
 
 
135
  "label": final_label,
136
  "confidence": final_confidence,
137
  "smishing_keywords_found": found_smishing,