dejanseo committed on
Commit
8582a8e
·
verified ·
1 Parent(s): bd98692

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +151 -84
app.py CHANGED
@@ -3,26 +3,115 @@ import torch
3
  import torch.nn.functional as F
4
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
5
  import re
6
- import logging # Optional: Add logging for better debugging
7
-
8
- # Set up logging (optional but helpful)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger(__name__)
11
 
12
- # Streamlit page config
13
  st.set_page_config(
14
  page_title="AI Article Detection by DEJAN",
15
  page_icon="🧠",
16
  layout="wide"
17
  )
18
 
19
- # Logo as provided
20
- st.logo(
21
- image="https://dejan.ai/wp-content/uploads/2024/02/dejan-300x103.png",
22
- link="https://dejan.ai/",
23
- )
24
-
25
- # Font styling
26
  st.markdown("""
27
  <link href="https://fonts.googleapis.com/css2?family=Roboto&display=swap" rel="stylesheet">
28
  <style>
@@ -32,22 +121,13 @@ st.markdown("""
32
  </style>
33
  """, unsafe_allow_html=True)
34
 
35
- @st.cache_resource # Cache the model and tokenizer to avoid reloading on every interaction
36
  def load_model_and_tokenizer(model_name):
37
- """Loads the model and tokenizer."""
38
- logger.info(f"Loading tokenizer: {model_name}")
39
  tokenizer = AutoTokenizer.from_pretrained(model_name)
40
-
41
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
42
- dtype = torch.bfloat16 if (device.type == "cuda" and torch.cuda.is_bf16_supported()) else torch.float32
43
- logger.info(f"Using device: {device} with dtype: {dtype}")
44
-
45
- logger.info(f"Loading model: {model_name}")
46
  model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype=dtype)
47
- model.to(device)
48
- model.eval()
49
- logger.info("Model loaded successfully.")
50
-
51
  return tokenizer, model, device
52
 
53
  MODEL_NAME = "dejanseo/ai-detection-small"
@@ -55,73 +135,60 @@ try:
55
  tokenizer, model, device = load_model_and_tokenizer(MODEL_NAME)
56
  except Exception as e:
57
  st.error(f"Error loading model: {e}")
58
- logger.error(f"Failed to load model or tokenizer: {e}", exc_info=True)
59
  st.stop()
60
 
61
- # Labels
62
- LABELS = ["AI Content", "Human Content"]
63
-
64
- # Regex-based sentence splitter
65
  def sent_tokenize(text):
66
- sentences = re.split(r'(?<=[\.!?])\s+', text.strip())
67
- return [s for s in sentences if s]
68
 
69
- # UI
70
  st.title("AI Article Detection")
71
- text = st.text_area("Enter text to classify", height=200, placeholder="Paste your text here...")
 
72
 
73
  if st.button("Classify", type="primary"):
74
- if not text or not text.strip():
75
  st.warning("Please enter some text.")
76
  else:
77
- with st.spinner("Analyzing... Please wait."):
78
- try:
79
- sentences = sent_tokenize(text)
80
- if not sentences:
81
- st.warning("No sentences detected.")
82
- st.stop()
83
-
84
- # Tokenize sentences
85
- inputs = tokenizer(
86
- sentences,
87
- return_tensors="pt",
88
- padding=True,
89
- truncation=True,
90
- max_length=model.config.max_position_embeddings
91
- ).to(device)
92
-
93
- # Inference
94
- with torch.no_grad():
95
- outputs = model(**inputs)
96
- logits = outputs.logits
97
- probs = F.softmax(logits, dim=-1).cpu() # [n_sentences, 2]
98
- preds = torch.argmax(probs, dim=-1).cpu()
99
-
100
- # Build inline styled text
101
- styled_chunks = []
102
- for i, sent in enumerate(sentences):
103
- pred = preds[i].item()
104
- # red for AI (class 0), green for Human (class 1)
105
- r, g = (255, 0) if pred == 0 else (0, 255)
106
- confidence = probs[i, pred].item() # 0.0–1.0
107
- alpha = confidence # opacity
108
- span = (
109
- f"<span "
110
- f"style='background-color: rgba({r},{g},0,{alpha:.2f}); "
111
- f"padding:2px; margin:0 2px; border-radius:3px;'>"
112
- f"{sent}"
113
- f"</span>"
114
- )
115
- styled_chunks.append(span)
116
-
117
- full_text_html = "".join(styled_chunks)
118
- st.markdown(full_text_html, unsafe_allow_html=True)
119
-
120
- # Overall AI likelihood (class 0)
121
- avg_probs = torch.mean(probs, dim=0)
122
- ai_likelihood = avg_probs[0].item() * 100
123
- st.subheader(f"🤖 AI Likelihood: {ai_likelihood:.1f}%")
124
-
125
- except Exception as e:
126
- st.error(f"An error occurred during analysis: {e}")
127
- logger.error("Analysis failed", exc_info=True)
 
3
  import torch.nn.functional as F
4
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
5
  import re
6
+ import math
7
+ import logging
8
+
9
+ # --- Heuristic weights & config (ported from the PHP implementation) ---
10
+ SIGMOID_K = 0.5
11
+
12
+ AI_WEIGHTS = {
13
+ 'users': 1.0000, 'significant': 0.8061, 'despite': 0.5934, 'additionally': 0.5313,
14
+ 'potential': 0.4900, 'features': 0.4824, 'various': 0.4785, 'regarding': 0.4689,
15
+ 'remains': 0.4403, 'featuring': 0.4031, 'experience': 0.4026, 'including': 0.3963,
16
+ 'challenges': 0.3548, 'allowing': 0.3528, 'enhance': 0.3437, 'aims': 0.3238,
17
+ 'leading': 0.3064, 'user': 0.3054, 'recent': 0.2717, 'concerns': 0.2707,
18
+ 'capabilities': 0.2684, 'technology': 0.2636, 'devices': 0.2616, 'following': 0.2551,
19
+ 'anticipated': 0.2484, 'unique': 0.2418, 'expressed': 0.2403, 'innovative': 0.2383,
20
+ 'design': 0.2379, 'remain': 0.2371, 'previous': 0.2331, 'priced': 0.2325,
21
+ 'launch': 0.2324, 'enhancing': 0.2319, 'showcasing': 0.2305, 'feature': 0.2288,
22
+ 'particularly': 0.2287, 'set': 0.2228, 'aimed': 0.2216, 'highlighted': 0.2192,
23
+ 'ongoing': 0.2188, 'access': 0.2182, 'available': 0.2159, 'alongside': 0.2144,
24
+ 'introduced': 0.2133, 'previously': 0.2122, 'highlighting': 0.2113, 'models': 0.2081,
25
+ 'faced': 0.2057, 'platforms': 0.2055, 'updates': 0.2037, 'offers': 0.2032,
26
+ 'significantly': 0.2027, 'issues': 0.2021, 'emphasized': 0.1977, 'initially': 0.1955,
27
+ 'content': 0.1926, 'emphasizing': 0.1924, 'options': 0.1874, 'performance': 0.1864,
28
+ 'initial': 0.1832, 'notable': 0.1821, 'additional': 0.1812, 'individuals': 0.1804,
29
+ 'initiative': 0.1802, 'enhanced': 0.1797, 'release': 0.1797, 'currently': 0.1790,
30
+ 'traditional': 0.1769, 'future': 0.1731, 'expected': 0.1725, 'applications': 0.1707,
31
+ 'indicating': 0.1699, 'notably': 0.1658, 'insights': 0.1656, 'noted': 0.1645,
32
+ 'players': 0.1645, 'narrative': 0.1642, 'landscape': 0.1640, 'upcoming': 0.1634,
33
+ 'providing': 0.1631, 'offering': 0.1615, 'enabling': 0.1610, 'gaming': 0.1595,
34
+ 'compared': 0.1553, 'indicated': 0.1539, 'extensive': 0.1530, 'approach': 0.1521,
35
+ 'allows': 0.1519, 'stated': 0.1519, 'development': 0.1515, 'commitment': 0.1495,
36
+ 'highlights': 0.1493, 'essential': 0.1483, 'experiences': 0.1480, 'recently': 0.1471,
37
+ 'suggesting': 0.1457, 'market': 0.1447, 'uncertain': 0.1440, 'potentially': 0.1433
38
+ }
39
+
40
+ OG_WEIGHTS = {
41
+ 'says': 1.0000, 'just': 0.9623, 'people': 0.8774, 'said': 0.8259, 'company': 0.7645,
42
+ 'll': 0.6372, 'make': 0.6237, 'time': 0.5634, 'way': 0.5374, 've': 0.5039,
43
+ 'want': 0.4435, 'like': 0.4426, 'don': 0.4338, 'going': 0.4160, 'really': 0.4126,
44
+ 'use': 0.3769, 'good': 0.3718, 'lot': 0.3710, 'able': 0.3611, 'things': 0.3595,
45
+ 'big': 0.3483, 'doesn': 0.3470, 'right': 0.3453, 'work': 0.3443, 'new': 0.3381,
46
+ 'know': 0.3355, 'think': 0.3218, 'today': 0.3209, 'isn': 0.3039, 'look': 0.3013,
47
+ 'world': 0.2907, 'say': 0.2875, 'best': 0.2825, 'used': 0.2758, 'little': 0.2735,
48
+ 'actually': 0.2724, 'phone': 0.2551, 'thing': 0.2477, 'year': 0.2461, 'come': 0.2328,
49
+ 'told': 0.2315, 'far': 0.2250, 'better': 0.2245, 'didn': 0.2244, 'getting': 0.2209,
50
+ 'help': 0.2193, 'makes': 0.2141, 'got': 0.2139, 'won': 0.2096, 'called': 0.2078,
51
+ 'different': 0.2010, 'verge': 0.2001, 'game': 0.1956, 'looks': 0.1954, 'comes': 0.1953,
52
+ 'years': 0.1935, 'working': 0.1924, 'kind': 0.1899, 'let': 0.1891, 'great': 0.1878,
53
+ 'read': 0.1876, 'number': 0.1868, 'long': 0.1852, 'according': 0.1795, 'coming': 0.1784,
54
+ 'day': 0.1750, 'pretty': 0.1734, 'looking': 0.1685, 'bit': 0.1682, 'place': 0.1677,
55
+ 'start': 0.1667, 'trying': 0.1661, 'sure': 0.1655, 'means': 0.1642, 'course': 0.1641,
56
+ 'week': 0.1637, 'story': 0.1610, 'buy': 0.1589, 'probably': 0.1581, 'play': 0.1561,
57
+ 'using': 0.1554, 'doing': 0.1551, 'hard': 0.1525, 'did': 0.1509, 'money': 0.1497,
58
+ 'point': 0.1472, 'idea': 0.1429, 'end': 0.1425, 'aren': 0.1396, 'fact': 0.1371,
59
+ 'run': 0.1363, 'does': 0.1362, 'case': 0.1331, 'built': 0.1301, 'biggest': 0.1300,
60
+ 'started': 0.1286, 'exactly': 0.1279, 'screen': 0.1277, 'deal': 0.1264, 'apps': 0.1234
61
+ }
62
+
63
def tokenize(text: str) -> list:
    """Return the lowercase ASCII words in *text* that have two or more letters.

    The text is lowercased first, then every maximal run of ``a``-``z`` that is
    delimited by word boundaries is collected in order of appearance.
    Single-letter words and runs that touch digits, underscores, or non-ASCII
    letters are not returned (e.g. ``"don't"`` yields ``"don"`` only).

    Args:
        text: Arbitrary input text.

    Returns:
        A list of lowercase word strings; empty when nothing matches.
    """
    return re.findall(r'\b[a-z]{2,}\b', text.lower())
65
+
66
def classify_text_likelihood(text: str) -> float:
    """Estimate how AI-like *text* is from its vocabulary alone.

    Every token produced by ``tokenize`` is looked up in ``AI_WEIGHTS`` and
    ``OG_WEIGHTS``; the weights are summed separately and the difference
    (AI minus OG) is squashed through a logistic curve with slope
    ``SIGMOID_K``.

    Args:
        text: The text to score.

    Returns:
        A value in ``[0, 1]``. ``0.5`` is returned when the text has no
        tokens or none of them appear in either weight table; values above
        ``0.5`` mean the AI-associated weights dominate.
    """
    tokens = tokenize(text)
    if not tokens:
        return 0.5

    ai_score = 0.0
    og_score = 0.0
    matched = False
    for token in tokens:
        ai_weight = AI_WEIGHTS.get(token, 0)
        og_weight = OG_WEIGHTS.get(token, 0)
        if ai_weight or og_weight:
            matched = True
            ai_score += ai_weight
            og_score += og_weight

    if not matched:
        return 0.5

    z = SIGMOID_K * (ai_score - og_score)
    # Numerically stable logistic: only ever exponentiate a non-positive
    # number, so a very long, strongly one-sided text cannot make math.exp
    # raise OverflowError (the naive 1 / (1 + exp(-z)) does for z << 0).
    if z >= 0:
        return 1 / (1 + math.exp(-z))
    exp_z = math.exp(z)
    return exp_z / (1 + exp_z)
82
+
83
+ # Wrap words in thick colored underlines based on heuristic
84
def highlight_heuristic_words(text: str) -> str:
    """Return *text* as HTML with heuristic vocabulary words underlined.

    Words found in ``AI_WEIGHTS`` get a dark-red underline and words found in
    ``OG_WEIGHTS`` get a dark-green one. Matching is case-insensitive, so
    the same words are highlighted as are scored after lowercasing.

    All other text is HTML-escaped, because the result is meant to be
    rendered as raw HTML and the input comes straight from the user.

    Args:
        text: Plain (untrusted) text to decorate.

    Returns:
        An HTML fragment in which matched words are wrapped in ``<span>``
        elements and everything else is escaped.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    import html

    out = []
    # The capturing group makes re.split keep the words themselves, so the
    # parts alternate between "text in between" and "candidate word".
    for part in re.split(r'(\b[A-Za-z]{2,}\b)', text):
        lower = part.lower()
        if lower in AI_WEIGHTS:
            colour = "darkred"
        elif lower in OG_WEIGHTS:
            colour = "darkgreen"
        else:
            # Not a vocabulary word: emit it as inert text.
            out.append(html.escape(part, quote=False))
            continue
        # Matched parts consist of ASCII letters only, so they need no escaping.
        out.append(
            f"<span style='text-decoration: underline; "
            f"text-decoration-color: {colour}; text-decoration-thickness: 2px;'>"
            f"{part}</span>"
        )
    return ''.join(out)
104
+
105
+ # --- Logging & Streamlit setup ---
106
  logging.basicConfig(level=logging.INFO)
107
  logger = logging.getLogger(__name__)
108
 
 
109
  st.set_page_config(
110
  page_title="AI Article Detection by DEJAN",
111
  page_icon="🧠",
112
  layout="wide"
113
  )
114
 
 
 
 
 
 
 
 
115
  st.markdown("""
116
  <link href="https://fonts.googleapis.com/css2?family=Roboto&display=swap" rel="stylesheet">
117
  <style>
 
121
  </style>
122
  """, unsafe_allow_html=True)
123
 
124
@st.cache_resource
def load_model_and_tokenizer(model_name):
    """Load the tokenizer and sequence-classification model for *model_name*.

    The function is wrapped in ``st.cache_resource`` so the model and
    tokenizer are not reloaded on every interaction.

    The model is placed on CUDA when available (otherwise CPU), loaded in
    bfloat16 only when running on a CUDA device that reports bf16 support
    (float32 otherwise), and switched to evaluation mode.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model, device)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    use_bf16 = device.type == "cuda" and torch.cuda.is_bf16_supported()
    dtype = torch.bfloat16 if use_bf16 else torch.float32

    model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype=dtype)
    model.to(device).eval()
    return tokenizer, model, device
132
 
133
  MODEL_NAME = "dejanseo/ai-detection-small"
 
135
  tokenizer, model, device = load_model_and_tokenizer(MODEL_NAME)
136
  except Exception as e:
137
  st.error(f"Error loading model: {e}")
138
+ logger.error(f"Failed to load model: {e}", exc_info=True)
139
  st.stop()
140
 
 
 
 
 
141
  def sent_tokenize(text):
142
+ return [s for s in re.split(r'(?<=[\.!?])\s+', text.strip()) if s]
 
143
 
 
144
  st.title("AI Article Detection")
145
+
146
# --- Classification UI: read text, score each sentence, render results ---
text = st.text_area("Enter text to classify", height=200, placeholder="Paste your text here…")

if st.button("Classify", type="primary"):
    if not text.strip():
        st.warning("Please enter some text.")
    else:
        with st.spinner("Analyzing"):
            sentences = sent_tokenize(text)
            if not sentences:
                st.warning("No sentences detected.")
                st.stop()

            # All sentences are scored in one padded batch.
            inputs = tokenizer(
                sentences,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=model.config.max_position_embeddings
            ).to(device)

            with torch.no_grad():
                logits = model(**inputs).logits
                # Upcast before softmax: the model may run in bfloat16, whose
                # precision is too coarse for averaged percentages.
                probs = F.softmax(logits.float(), dim=-1).cpu()
                preds = torch.argmax(probs, dim=-1).cpu()

            # Highlight each sentence and underline heuristic words.
            chunks = []
            for i, s in enumerate(sentences):
                inner = highlight_heuristic_words(s)
                p = preds[i].item()
                # Red background for class 0 (AI), green for class 1 (human);
                # opacity is the confidence of the predicted class.
                r, g = (255, 0) if p == 0 else (0, 255)
                alpha = probs[i, p].item()
                chunks.append(
                    f"<span style='background-color: rgba({r},{g},0,{alpha:.2f}); "
                    f"padding:2px; margin:0 2px; border-radius:3px;'>{inner}</span>"
                )
            st.markdown("".join(chunks), unsafe_allow_html=True)

            # Scores
            avg = torch.mean(probs, dim=0)
            model_ai = avg[0].item()
            heuristic_ai = classify_text_likelihood(text)
            # Average the two likelihoods. Summing them (clamped to 1.0) made a
            # neutral heuristic of 0.5 add fifty points to every result.
            combined = (model_ai + heuristic_ai) / 2

            st.subheader(f"🤖 Model AI Likelihood: {model_ai*100:.1f}%")
            st.subheader(f"🛠️ Heuristic AI Likelihood: {heuristic_ai*100:.1f}%")
            st.subheader(f"⚖️ Combined AI Likelihood: {combined*100:.1f}%")