"""Streamlit app that estimates whether an article was AI-written.

Two signals are shown: a per-sentence transformer classifier and a
word-weight heuristic whose weights are read from environment secrets.
"""

import ast
import html
import json
import logging
import math
import os
import re

import streamlit as st
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

st.set_page_config(
    page_title="AI Article Detection by DEJAN",
    page_icon="🧠",
    layout="wide"
)


# --- Load heuristic weights from environment secrets, with JSON→Python fallback ---
@st.cache_resource
def load_heuristic_weights():
    """Read the two heuristic weight tables from the environment.

    Each of ``AI_WEIGHTS_JSON`` and ``OG_WEIGHTS_JSON`` is parsed as JSON
    first; if that fails it is parsed as a Python literal instead.

    Returns:
        A ``(ai_weights, og_weights)`` tuple of the parsed values.

    Raises:
        KeyError: if either environment variable is missing.
        ValueError, SyntaxError: if a value is neither valid JSON nor a
            valid Python literal.
    """
    def _load(env_key):
        raw = os.environ[env_key]
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            # literal_eval only accepts Python literals; it never runs code.
            return ast.literal_eval(raw)

    ai = _load("AI_WEIGHTS_JSON")
    og = _load("OG_WEIGHTS_JSON")
    return ai, og


AI_WEIGHTS, OG_WEIGHTS = load_heuristic_weights()
SIGMOID_K = 0.5

# Compiled once at import time instead of on every call.
_WORD_RE = re.compile(r'\b[a-z]{2,}\b')
# Case-insensitive so the highlighter sees the same words that tokenize()
# (which lowercases its input first) feeds to the scorer.
_WORD_SPLIT_RE = re.compile(r'(\b[a-z]{2,}\b)', re.IGNORECASE)
_SENTENCE_SPLIT_RE = re.compile(r'(?<=[\.!?])\s+')


def tokenize(text):
    """Return every run of two or more a-z letters in the lowercased text."""
    return _WORD_RE.findall(text.lower())


def _sigmoid(x: float) -> float:
    """Logistic function that cannot overflow for large-magnitude ``x``."""
    if x >= 0:
        return 1.0 / (1.0 + math.exp(-x))
    # For negative x, exp(-x) could exceed the float range; exp(x) cannot.
    z = math.exp(x)
    return z / (1.0 + z)


def classify_text_likelihood(text: str) -> float:
    """Score ``text`` with the word-weight heuristic.

    Sums the AI and OG weights of every token, then squashes the
    difference (AI minus OG) through a sigmoid scaled by ``SIGMOID_K``.

    Returns:
        A value in [0, 1]; 0.5 when the text has no tokens or no token has
        a non-zero weight in either table.
    """
    tokens = tokenize(text)
    if not tokens:
        return 0.5
    ai_score = og_score = matched = 0
    for t in tokens:
        aw = AI_WEIGHTS.get(t, 0)
        ow = OG_WEIGHTS.get(t, 0)
        if aw or ow:
            matched += 1
            ai_score += aw
            og_score += ow
    if matched == 0:
        return 0.5
    net = ai_score - og_score
    # A plain 1 / (1 + exp(-k * net)) raises OverflowError once -k * net
    # passes ~709, which long texts with large summed weights can reach.
    return _sigmoid(SIGMOID_K * net)


def highlight_heuristic_words(text: str) -> str:
    """Return ``text`` as HTML with heuristic words wrapped for highlighting.

    The text is split into word and non-word parts. Every part is
    HTML-escaped because the result is rendered with
    ``unsafe_allow_html=True``. Words are looked up case-insensitively,
    AI_WEIGHTS first, then OG_WEIGHTS.
    """
    # NOTE(review): the wrapper strings around matched words are empty as
    # this file appears here, so all three branches emit the same text. The
    # highlight markup looks like it was lost — confirm and restore.
    parts = _WORD_SPLIT_RE.split(text)
    out = []
    for part in parts:
        lower = part.lower()
        safe = html.escape(part)
        if lower in AI_WEIGHTS:
            out.append(
                f""
                f"{safe}"
            )
        elif lower in OG_WEIGHTS:
            out.append(
                f""
                f"{safe}"
            )
        else:
            out.append(safe)
    return ''.join(out)


# --- Logging & Streamlit setup ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# NOTE(review): this markup block is blank as the file appears here;
# presumably it held page CSS — confirm against the deployed file.
st.markdown(""" """, unsafe_allow_html=True)


@st.cache_resource
def load_model_and_tokenizer(model_name):
    """Load the tokenizer and sequence-classification model for ``model_name``.

    The model is moved to CUDA when available (CPU otherwise) and put in
    eval mode. bfloat16 is used only on CUDA devices that report support
    for it; float32 is used everywhere else.

    Returns:
        A ``(tokenizer, model, device)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = torch.bfloat16 if (device.type == "cuda" and torch.cuda.is_bf16_supported()) else torch.float32
    model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype=dtype)
    model.to(device).eval()
    return tokenizer, model, device


MODEL_NAME = "dejanseo/ai-detection-small"
try:
    tokenizer, model, device = load_model_and_tokenizer(MODEL_NAME)
except Exception as e:
    # Top-level boundary: show the failure in the UI, log the traceback,
    # and halt this script run.
    st.error(f"Error loading model: {e}")
    logger.exception("Failed to load model: %s", e)
    st.stop()


def sent_tokenize(text):
    """Split text into sentences at whitespace that follows '.', '!' or '?'.

    The separating whitespace is discarded and empty pieces are dropped.
    """
    return [s for s in _SENTENCE_SPLIT_RE.split(text.strip()) if s]


st.title("AI Article Detection")
text = st.text_area("Enter text to classify", height=200, placeholder="Paste your text here…")

if st.button("Classify", type="primary"):
    if not text.strip():
        st.warning("Please enter some text.")
    else:
        with st.spinner("Analyzing…"):
            sentences = sent_tokenize(text)
            if not sentences:
                st.warning("No sentences detected.")
                st.stop()

            # All sentences are classified in one padded batch.
            inputs = tokenizer(
                sentences,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=model.config.max_position_embeddings
            ).to(device)
            with torch.no_grad():
                logits = model(**inputs).logits

            # Upcast first so softmax and the later mean run in float32
            # even when the model itself runs in bfloat16.
            probs = F.softmax(logits.float(), dim=-1).cpu()
            preds = torch.argmax(probs, dim=-1)

            chunks = []
            for i, s in enumerate(sentences):
                inner = highlight_heuristic_words(s)
                p = preds[i].item()
                # Class index 0 maps to red, any other class to green.
                r, g = (255, 0) if p == 0 else (0, 255)
                conf = probs[i, p].item()
                alpha = conf
                # NOTE(review): r, g and alpha are never used in the span as
                # it appears here; the per-sentence styling markup seems to
                # be missing — confirm against the deployed file.
                span = (
                    f"{inner}"
                )
                chunks.append(span)

            # sent_tokenize() drops the whitespace between sentences, so
            # put one space back to stop them running together.
            st.markdown(" ".join(chunks), unsafe_allow_html=True)

            avg = torch.mean(probs, dim=0)
            model_ai = avg[0].item()
            heuristic_ai = classify_text_likelihood(text)
            # NOTE(review): this adds two probabilities and caps the sum at
            # 1.0. Since the heuristic returns 0.5 when nothing matches, the
            # combined score is then at least 50% — confirm this is the
            # intended combination rule.
            combined = min(model_ai + heuristic_ai, 1.0)

            st.subheader(f"🤖 Model AI Likelihood: {model_ai*100:.1f}%")
            st.subheader(f"🛠️ Heuristic AI Likelihood: {heuristic_ai*100:.1f}%")
            st.subheader(f"⚖️ Combined AI Likelihood: {combined*100:.1f}%")