Update app.py

app.py CHANGED
@@ -3,26 +3,115 @@ import torch
 import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import re
-import logging
-
-
+import math
+import logging
+
+# --- Heuristic weights & config (from your PHP) ---
+SIGMOID_K = 0.5
+
+AI_WEIGHTS = {
+    'users': 1.0000, 'significant': 0.8061, 'despite': 0.5934, 'additionally': 0.5313,
+    'potential': 0.4900, 'features': 0.4824, 'various': 0.4785, 'regarding': 0.4689,
+    'remains': 0.4403, 'featuring': 0.4031, 'experience': 0.4026, 'including': 0.3963,
+    'challenges': 0.3548, 'allowing': 0.3528, 'enhance': 0.3437, 'aims': 0.3238,
+    'leading': 0.3064, 'user': 0.3054, 'recent': 0.2717, 'concerns': 0.2707,
+    'capabilities': 0.2684, 'technology': 0.2636, 'devices': 0.2616, 'following': 0.2551,
+    'anticipated': 0.2484, 'unique': 0.2418, 'expressed': 0.2403, 'innovative': 0.2383,
+    'design': 0.2379, 'remain': 0.2371, 'previous': 0.2331, 'priced': 0.2325,
+    'launch': 0.2324, 'enhancing': 0.2319, 'showcasing': 0.2305, 'feature': 0.2288,
+    'particularly': 0.2287, 'set': 0.2228, 'aimed': 0.2216, 'highlighted': 0.2192,
+    'ongoing': 0.2188, 'access': 0.2182, 'available': 0.2159, 'alongside': 0.2144,
+    'introduced': 0.2133, 'previously': 0.2122, 'highlighting': 0.2113, 'models': 0.2081,
+    'faced': 0.2057, 'platforms': 0.2055, 'updates': 0.2037, 'offers': 0.2032,
+    'significantly': 0.2027, 'issues': 0.2021, 'emphasized': 0.1977, 'initially': 0.1955,
+    'content': 0.1926, 'emphasizing': 0.1924, 'options': 0.1874, 'performance': 0.1864,
+    'initial': 0.1832, 'notable': 0.1821, 'additional': 0.1812, 'individuals': 0.1804,
+    'initiative': 0.1802, 'enhanced': 0.1797, 'release': 0.1797, 'currently': 0.1790,
+    'traditional': 0.1769, 'future': 0.1731, 'expected': 0.1725, 'applications': 0.1707,
+    'indicating': 0.1699, 'notably': 0.1658, 'insights': 0.1656, 'noted': 0.1645,
+    'players': 0.1645, 'narrative': 0.1642, 'landscape': 0.1640, 'upcoming': 0.1634,
+    'providing': 0.1631, 'offering': 0.1615, 'enabling': 0.1610, 'gaming': 0.1595,
+    'compared': 0.1553, 'indicated': 0.1539, 'extensive': 0.1530, 'approach': 0.1521,
+    'allows': 0.1519, 'stated': 0.1519, 'development': 0.1515, 'commitment': 0.1495,
+    'highlights': 0.1493, 'essential': 0.1483, 'experiences': 0.1480, 'recently': 0.1471,
+    'suggesting': 0.1457, 'market': 0.1447, 'uncertain': 0.1440, 'potentially': 0.1433
+}
+
+OG_WEIGHTS = {
+    'says': 1.0000, 'just': 0.9623, 'people': 0.8774, 'said': 0.8259, 'company': 0.7645,
+    'll': 0.6372, 'make': 0.6237, 'time': 0.5634, 'way': 0.5374, 've': 0.5039,
+    'want': 0.4435, 'like': 0.4426, 'don': 0.4338, 'going': 0.4160, 'really': 0.4126,
+    'use': 0.3769, 'good': 0.3718, 'lot': 0.3710, 'able': 0.3611, 'things': 0.3595,
+    'big': 0.3483, 'doesn': 0.3470, 'right': 0.3453, 'work': 0.3443, 'new': 0.3381,
+    'know': 0.3355, 'think': 0.3218, 'today': 0.3209, 'isn': 0.3039, 'look': 0.3013,
+    'world': 0.2907, 'say': 0.2875, 'best': 0.2825, 'used': 0.2758, 'little': 0.2735,
+    'actually': 0.2724, 'phone': 0.2551, 'thing': 0.2477, 'year': 0.2461, 'come': 0.2328,
+    'told': 0.2315, 'far': 0.2250, 'better': 0.2245, 'didn': 0.2244, 'getting': 0.2209,
+    'help': 0.2193, 'makes': 0.2141, 'got': 0.2139, 'won': 0.2096, 'called': 0.2078,
+    'different': 0.2010, 'verge': 0.2001, 'game': 0.1956, 'looks': 0.1954, 'comes': 0.1953,
+    'years': 0.1935, 'working': 0.1924, 'kind': 0.1899, 'let': 0.1891, 'great': 0.1878,
+    'read': 0.1876, 'number': 0.1868, 'long': 0.1852, 'according': 0.1795, 'coming': 0.1784,
+    'day': 0.1750, 'pretty': 0.1734, 'looking': 0.1685, 'bit': 0.1682, 'place': 0.1677,
+    'start': 0.1667, 'trying': 0.1661, 'sure': 0.1655, 'means': 0.1642, 'course': 0.1641,
+    'week': 0.1637, 'story': 0.1610, 'buy': 0.1589, 'probably': 0.1581, 'play': 0.1561,
+    'using': 0.1554, 'doing': 0.1551, 'hard': 0.1525, 'did': 0.1509, 'money': 0.1497,
+    'point': 0.1472, 'idea': 0.1429, 'end': 0.1425, 'aren': 0.1396, 'fact': 0.1371,
+    'run': 0.1363, 'does': 0.1362, 'case': 0.1331, 'built': 0.1301, 'biggest': 0.1300,
+    'started': 0.1286, 'exactly': 0.1279, 'screen': 0.1277, 'deal': 0.1264, 'apps': 0.1234
+}
+
+def tokenize(text):
+    return re.findall(r'\b[a-z]{2,}\b', text.lower())
+
+def classify_text_likelihood(text: str) -> float:
+    tokens = tokenize(text)
+    if not tokens:
+        return 0.5
+    ai_score = og_score = matched = 0
+    for t in tokens:
+        aw = AI_WEIGHTS.get(t, 0)
+        ow = OG_WEIGHTS.get(t, 0)
+        if aw or ow:
+            matched += 1
+            ai_score += aw
+            og_score += ow
+    if matched == 0:
+        return 0.5
+    net = ai_score - og_score
+    return 1 / (1 + math.exp(-SIGMOID_K * net))
+
+# Wrap words in thick colored underlines based on heuristic
+def highlight_heuristic_words(text: str) -> str:
+    parts = re.split(r'(\b[a-z]{2,}\b)', text)
+    out = []
+    for part in parts:
+        lower = part.lower()
+        if lower in AI_WEIGHTS:
+            out.append(
+                f"<span style='text-decoration: underline; "
+                f"text-decoration-color: darkred; text-decoration-thickness: 2px;'>"
+                f"{part}</span>"
+            )
+        elif lower in OG_WEIGHTS:
+            out.append(
+                f"<span style='text-decoration: underline; "
+                f"text-decoration-color: darkgreen; text-decoration-thickness: 2px;'>"
+                f"{part}</span>"
+            )
+        else:
+            out.append(part)
+    return ''.join(out)
+
+# --- Logging & Streamlit setup ---
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# Streamlit page config
 st.set_page_config(
     page_title="AI Article Detection by DEJAN",
     page_icon="🧠",
     layout="wide"
 )
 
-# Logo as provided
-st.logo(
-    image="https://dejan.ai/wp-content/uploads/2024/02/dejan-300x103.png",
-    link="https://dejan.ai/",
-)
-
-# Font styling
 st.markdown("""
 <link href="https://fonts.googleapis.com/css2?family=Roboto&display=swap" rel="stylesheet">
 <style>
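The block above adds a self-contained lexical heuristic: matched tokens accumulate their AI_WEIGHTS and OG_WEIGHTS scores, and the net difference is squashed through a logistic, 1 / (1 + e^(-SIGMOID_K * net)), so AI-flavoured vocabulary pushes the score toward 1.0, human-flavoured vocabulary toward 0.0, and a text with no matched words sits at 0.5. A condensed sanity check of that path, runnable on its own (the weight tables are truncated to three entries each here, and the sample sentences are made up):

    import math
    import re

    SIGMOID_K = 0.5
    # Tiny excerpts of the commit's weight tables, enough to exercise the scoring path.
    AI_WEIGHTS = {'users': 1.0000, 'significant': 0.8061, 'additionally': 0.5313}
    OG_WEIGHTS = {'says': 1.0000, 'just': 0.9623, 'people': 0.8774}

    def tokenize(text):
        # Lowercase words of two or more letters, as in the commit.
        return re.findall(r'\b[a-z]{2,}\b', text.lower())

    def classify_text_likelihood(text: str) -> float:
        ai = og = matched = 0
        for t in tokenize(text):
            aw, ow = AI_WEIGHTS.get(t, 0), OG_WEIGHTS.get(t, 0)
            if aw or ow:
                matched += 1
                ai += aw
                og += ow
        if matched == 0:
            return 0.5  # no lexical signal either way
        return 1 / (1 + math.exp(-SIGMOID_K * (ai - og)))

    # Hypothetical sample inputs:
    print(round(classify_text_likelihood("Additionally, users report significant gains."), 3))  # ~0.763, leans AI
    print(round(classify_text_likelihood("He says people just want it."), 3))                   # ~0.195, leans human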
@@ -32,22 +121,13 @@ st.markdown("""
 </style>
 """, unsafe_allow_html=True)
 
-@st.cache_resource
+@st.cache_resource
 def load_model_and_tokenizer(model_name):
-    """Loads the model and tokenizer."""
-    logger.info(f"Loading tokenizer: {model_name}")
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
-    logger.info(f"Using device: {device} with dtype: {dtype}")
-
-    logger.info(f"Loading model: {model_name}")
+    dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
     model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype=dtype)
-    model.to(device)
-    model.eval()
-    logger.info("Model loaded successfully.")
-
+    model.to(device).eval()
     return tokenizer, model, device
 
 MODEL_NAME = "dejanseo/ai-detection-small"
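The rewritten loader drops the logging chatter and makes the dtype choice explicit: bfloat16 is requested only when a CUDA device reports bf16 support, with float32 as the fallback on CPU and older GPUs. The same two lines run standalone to show what a given machine would pick:

    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # bf16 only on CUDA hardware that supports it; everything else stays float32.
    dtype = torch.bfloat16 if (device.type == "cuda" and torch.cuda.is_bf16_supported()) else torch.float32
    print(device, dtype)  # e.g. "cuda torch.bfloat16" on recent GPUs, "cpu torch.float32" otherwise

Because the function is wrapped in @st.cache_resource, the tokenizer and model load once per process instead of on every Streamlit rerun.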
@@ -55,73 +135,60 @@ try:
     tokenizer, model, device = load_model_and_tokenizer(MODEL_NAME)
 except Exception as e:
     st.error(f"Error loading model: {e}")
-    logger.error(f"Failed to load model: {e}")
+    logger.error(f"Failed to load model: {e}", exc_info=True)
     st.stop()
 
-# Labels
-LABELS = ["AI Content", "Human Content"]
-
-# Regex-based sentence splitter
 def sent_tokenize(text):
-    [...]
-    return [s for s in sentences if s]
+    return [s for s in re.split(r'(?<=[\.!?])\s+', text.strip()) if s]
 
-# UI
 st.title("AI Article Detection")
-[...]
+
+text = st.text_area("Enter text to classify", height=200, placeholder="Paste your text here…")
 
 if st.button("Classify", type="primary"):
-    if not text:
+    if not text.strip():
         st.warning("Please enter some text.")
     else:
-        with st.spinner("Analyzing..."):
-            [... 42 deleted lines elided in the page view ...]
-                # Overall AI likelihood (class 0)
-                avg_probs = torch.mean(probs, dim=0)
-                ai_likelihood = avg_probs[0].item() * 100
-                st.subheader(f"🤖 AI Likelihood: {ai_likelihood:.1f}%")
-
-            except Exception as e:
-                st.error(f"An error occurred during analysis: {e}")
-                logger.error("Analysis failed", exc_info=True)
+        with st.spinner("Analyzing…"):
+            sentences = sent_tokenize(text)
+            if not sentences:
+                st.warning("No sentences detected.")
+                st.stop()
+
+            inputs = tokenizer(
+                sentences,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=model.config.max_position_embeddings
+            ).to(device)
+
+            with torch.no_grad():
+                logits = model(**inputs).logits
+                probs = F.softmax(logits, dim=-1).cpu()
+            preds = torch.argmax(probs, dim=-1).cpu()
+
+            # Highlight each sentence and underline heuristic words
+            chunks = []
+            for i, s in enumerate(sentences):
+                inner = highlight_heuristic_words(s)
+                p = preds[i].item()
+                r, g = (255, 0) if p == 0 else (0, 255)
+                conf = probs[i, p].item()
+                alpha = conf
+                span = (
+                    f"<span style='background-color: rgba({r},{g},0,{alpha:.2f}); "
+                    f"padding:2px; margin:0 2px; border-radius:3px;'>{inner}</span>"
+                )
+                chunks.append(span)
+            st.markdown("".join(chunks), unsafe_allow_html=True)
+
+            # Scores
+            avg = torch.mean(probs, dim=0)
+            model_ai = avg[0].item()
+            heuristic_ai = classify_text_likelihood(text)
+            combined = min(model_ai + heuristic_ai, 1.0)
+
+            st.subheader(f"🤖 Model AI Likelihood: {model_ai*100:.1f}%")
+            st.subheader(f"🛠️ Heuristic AI Likelihood: {heuristic_ai*100:.1f}%")
+            st.subheader(f"⚖️ Combined AI Likelihood: {combined*100:.1f}%")