import os
import json
import ast
import re
import math
import logging

import streamlit as st
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
st.set_page_config(
    page_title="AI Article Detection by DEJAN",
    page_icon="🧠",
    layout="wide"
)
# --- Load heuristic weights from environment secrets, with JSON→Python fallback ---
def load_heuristic_weights():
    def _load(env_key):
        raw = os.environ[env_key]
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            return ast.literal_eval(raw)
    ai = _load("AI_WEIGHTS_JSON")
    og = _load("OG_WEIGHTS_JSON")
    return ai, og

AI_WEIGHTS, OG_WEIGHTS = load_heuristic_weights()
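# --- Heuristic word-weight classifier ---
# Each recognised token contributes an AI weight and an "original" weight; the net
# difference is squashed through a logistic curve (steepness SIGMOID_K) into a 0-1 likelihood.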
SIGMOID_K = 0.5

def tokenize(text):
    return re.findall(r'\b[a-z]{2,}\b', text.lower())
def classify_text_likelihood(text: str) -> float:
    tokens = tokenize(text)
    if not tokens:
        return 0.5  # no usable tokens: stay neutral
    ai_score = og_score = matched = 0
    for t in tokens:
        aw = AI_WEIGHTS.get(t, 0)
        ow = OG_WEIGHTS.get(t, 0)
        if aw or ow:
            matched += 1
            ai_score += aw
            og_score += ow
    if matched == 0:
        return 0.5  # no weighted words matched: stay neutral
    net = ai_score - og_score
    return 1 / (1 + math.exp(-SIGMOID_K * net))
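# Underline words that carry heuristic weight: dark red for AI-weighted terms,
# dark green for "original" (human) terms. Markup is returned as raw HTML.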
def highlight_heuristic_words(text: str) -> str:
    # Split on whole words regardless of case so mixed-case words are still
    # matched against the lowercase weight dictionaries.
    parts = re.split(r'(\b[A-Za-z]{2,}\b)', text)
    out = []
    for part in parts:
        lower = part.lower()
        if lower in AI_WEIGHTS:
            out.append(
                f"<span style='text-decoration: underline; "
                f"text-decoration-color: darkred; text-decoration-thickness: 2px;'>"
                f"{part}</span>"
            )
        elif lower in OG_WEIGHTS:
            out.append(
                f"<span style='text-decoration: underline; "
                f"text-decoration-color: darkgreen; text-decoration-thickness: 2px;'>"
                f"{part}</span>"
            )
        else:
            out.append(part)
    return ''.join(out)
# --- Logging & Streamlit setup ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

st.markdown("""
<link href="https://fonts.googleapis.com/css2?family=Roboto&display=swap" rel="stylesheet">
<style>
html, body, [class*="css"] {
    font-family: 'Roboto', sans-serif;
}
</style>
""", unsafe_allow_html=True)
@st.cache_resource  # cache across Streamlit reruns so the model is only loaded once
def load_model_and_tokenizer(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = torch.bfloat16 if (device.type == "cuda" and torch.cuda.is_bf16_supported()) else torch.float32
    model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype=dtype)
    model.to(device).eval()
    return tokenizer, model, device
MODEL_NAME = "dejanseo/ai-detection-small"

try:
    tokenizer, model, device = load_model_and_tokenizer(MODEL_NAME)
except Exception as e:
    st.error(f"Error loading model: {e}")
    logger.error(f"Failed to load model: {e}", exc_info=True)
    st.stop()
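# Lightweight sentence splitter: breaks on whitespace that follows ., ! or ?.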
def sent_tokenize(text):
    return [s for s in re.split(r'(?<=[\.!?])\s+', text.strip()) if s]
st.title("AI Article Detection")
text = st.text_area("Enter text to classify", height=200, placeholder="Paste your text here…")

if st.button("Classify", type="primary"):
    if not text.strip():
        st.warning("Please enter some text.")
    else:
        with st.spinner("Analyzing…"):
            sentences = sent_tokenize(text)
            if not sentences:
                st.warning("No sentences detected.")
                st.stop()
            # Tokenize all sentences as a single padded batch.
            inputs = tokenizer(
                sentences,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=model.config.max_position_embeddings
            ).to(device)
            with torch.no_grad():
                logits = model(**inputs).logits
                probs = F.softmax(logits, dim=-1).cpu()
            preds = torch.argmax(probs, dim=-1)
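            # Render each sentence as an HTML span tinted red (class 0 = AI) or green
            # (class 1 = original), with opacity proportional to the model's confidence.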
            chunks = []
            for i, s in enumerate(sentences):
                inner = highlight_heuristic_words(s)
                p = preds[i].item()
                r, g = (255, 0) if p == 0 else (0, 255)
                conf = probs[i, p].item()
                alpha = conf
                span = (
                    f"<span style='background-color: rgba({r},{g},0,{alpha:.2f}); "
                    f"padding:2px; margin:0 2px; border-radius:3px;'>{inner}</span>"
                )
                chunks.append(span)
            st.markdown("".join(chunks), unsafe_allow_html=True)
            avg = torch.mean(probs, dim=0)
            model_ai = avg[0].item()
            heuristic_ai = classify_text_likelihood(text)
            combined = min(model_ai + heuristic_ai, 1.0)
            st.subheader(f"🤖 Model AI Likelihood: {model_ai*100:.1f}%")
            st.subheader(f"🛠️ Heuristic AI Likelihood: {heuristic_ai*100:.1f}%")
            st.subheader(f"⚖️ Combined AI Likelihood: {combined*100:.1f}%")