# Streamlit app: sentence-level "AI vs. human" text classification.
#
# Each sentence of the submitted text is scored by a sequence-classification
# model; the per-sentence results are shown in a table, and an overall figure
# is derived from the averaged model probabilities plus a word-weight heuristic
# whose weights are read from environment variables.
#
# NOTE: the header is written as comments (not a module docstring) so that it
# can never be picked up by Streamlit's "magic" rendering of bare expressions.
import ast
import json
import logging
import math
import os
import re

import pandas as pd
import streamlit as st
import torch
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# --- Logging setup (done first so every later failure path can log) ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Streamlit page setup ---
st.set_page_config(
    page_title="AI Article Detection by DEJAN",
    page_icon="🧠",
    layout="wide",
)

st.logo(
    image="https://dejan.ai/wp-content/uploads/2024/02/dejan-300x103.png",
    link="https://dejan.ai/",
)


# --- Load heuristic weights from environment secrets, with JSON→Python fallback ---
@st.cache_resource
def load_heuristic_weights():
    """Return the ``(ai_weights, og_weights)`` dicts read from the environment.

    Each of ``AI_WEIGHTS_JSON`` and ``OG_WEIGHTS_JSON`` is parsed as JSON first
    and, if that fails, as a Python literal.

    Raises:
        RuntimeError: if a variable is missing, cannot be parsed, or does not
            evaluate to a dict. The message names the variable only; the
            original exception is suppressed so the secret's contents never
            end up in a traceback.
    """

    def _load(env_key):
        try:
            raw = os.environ[env_key]
        except KeyError:
            raise RuntimeError(
                f"environment variable {env_key} is not set"
            ) from None

        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            try:
                parsed = ast.literal_eval(raw)
            except (ValueError, SyntaxError, TypeError):
                raise RuntimeError(
                    f"{env_key} is neither valid JSON nor a Python literal"
                ) from None

        # The weights are queried with ``.get`` later on, so anything that is
        # not a mapping would only fail much later and far less clearly.
        if not isinstance(parsed, dict):
            raise RuntimeError(f"{env_key} must decode to a dict of weights")
        return parsed

    ai = _load("AI_WEIGHTS_JSON")
    og = _load("OG_WEIGHTS_JSON")
    return ai, og


try:
    AI_WEIGHTS, OG_WEIGHTS = load_heuristic_weights()
except RuntimeError as e:
    # Same user-facing pattern as the model-loading failure path below.
    st.error(f"Error loading heuristic weights: {e}")
    logger.error("Failed to load heuristic weights: %s", e)
    st.stop()

SIGMOID_K = 0.5

# Compiled once; both patterns are applied on every classification request.
_WORD_RE = re.compile(r'\b[a-z]{2,}\b')
_SENTENCE_SPLIT_RE = re.compile(r'(?<=[\.!?])\s+')


def tokenize(text):
    """Return the lower-cased runs of two or more ASCII letters found in *text*."""
    return _WORD_RE.findall(text.lower())


def _sigmoid(x):
    """Logistic function that cannot overflow for large-magnitude *x*.

    ``math.exp`` raises ``OverflowError`` for large positive arguments, so the
    exponent is always kept non-positive; underflow silently yields 0.0.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    e = math.exp(x)
    return e / (1 + e)


def classify_text_likelihood(text: str) -> float:
    """Return a heuristic AI-likelihood in ``[0, 1]`` based on word weights.

    For every token, its weight in ``AI_WEIGHTS`` and ``OG_WEIGHTS`` is
    accumulated; the difference of the two totals is squashed through a
    sigmoid scaled by ``SIGMOID_K``. Returns the neutral value ``0.5`` when
    the text has no tokens or none of them carries a non-zero weight.
    """
    tokens = tokenize(text)
    if not tokens:
        return 0.5

    ai_score = og_score = 0
    matched = 0
    for t in tokens:
        aw = AI_WEIGHTS.get(t, 0)
        ow = OG_WEIGHTS.get(t, 0)
        if aw or ow:
            matched += 1
            ai_score += aw
            og_score += ow

    if matched == 0:
        return 0.5

    net = ai_score - og_score
    # Stable form: the naive 1 / (1 + exp(-k * net)) raises OverflowError once
    # -k * net exceeds ~709, which long, strongly "human" texts can reach.
    return _sigmoid(SIGMOID_K * net)


# Custom markup hook; the payload is currently blank.
st.markdown(""" """, unsafe_allow_html=True)


@st.cache_resource
def load_model_and_tokenizer(model_name):
    """Load and cache the tokenizer and classification model for *model_name*.

    The model is moved to CUDA when available (using bfloat16 if the device
    supports it, float32 otherwise) and switched to eval mode.

    Returns:
        A ``(tokenizer, model, device)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    use_bf16 = device.type == "cuda" and torch.cuda.is_bf16_supported()
    dtype = torch.bfloat16 if use_bf16 else torch.float32
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, torch_dtype=dtype
    )
    model.to(device).eval()
    return tokenizer, model, device


MODEL_NAME = "dejanseo/ai-cop"
try:
    tokenizer, model, device = load_model_and_tokenizer(MODEL_NAME)
except Exception as e:
    # Top-level boundary: report to the user, log with traceback, halt the run.
    st.error(f"Error loading model: {e}")
    logger.exception("Failed to load model: %s", e)
    st.stop()


def sent_tokenize(text):
    """Split *text* into sentences on whitespace that follows ``.``, ``!`` or ``?``.

    Empty pieces are dropped.
    """
    return [s for s in _SENTENCE_SPLIT_RE.split(text.strip()) if s]


# --- UI ---
st.title("AI Article Detection")
text = st.text_area(
    "Enter text to classify",
    height=200,
    placeholder="Paste your text here…",
)

if st.button("Classify", type="primary"):
    if not text.strip():
        st.warning("Please enter some text.")
    else:
        with st.spinner("Analyzing…"):
            sentences = sent_tokenize(text)
            if not sentences:
                st.warning("No sentences detected.")
                st.stop()

            # NOTE(review): assumes max_position_embeddings is a usable
            # tokenizer max_length for this checkpoint — TODO confirm.
            inputs = tokenizer(
                sentences,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=model.config.max_position_embeddings,
            ).to(device)

            with torch.no_grad():
                logits = model(**inputs).logits
                probs = F.softmax(logits, dim=-1).cpu()
                # probs already lives on the CPU, so argmax does as well.
                preds = torch.argmax(probs, dim=-1)

            # One row per sentence: predicted label and its probability.
            # Class index 0 is reported as "AI", anything else as "Human".
            sentences_data = []
            for i, (s, p) in enumerate(zip(sentences, preds.tolist())):
                sentences_data.append({
                    "sentence": s,
                    "classification": "AI" if p == 0 else "Human",
                    "confidence": probs[i, p].item(),
                })

            # Display as dataframe with progress column
            df = pd.DataFrame(sentences_data)
            st.dataframe(
                df,
                column_config={
                    "sentence": st.column_config.TextColumn("Sentence"),
                    "classification": st.column_config.TextColumn("Classification"),
                    "confidence": st.column_config.ProgressColumn(
                        "Confidence",
                        help="Model's confidence in the classification",
                        format="%.2f",
                        min_value=0,
                        max_value=1,
                    ),
                },
                hide_index=True,
            )

            # Document-level model score: mean probability of class 0 ("AI").
            avg = torch.mean(probs, dim=0)
            model_ai = avg[0].item()
            heuristic_ai = classify_text_likelihood(text)
            # NOTE(review): the two scores are summed and capped at 1.0, so a
            # neutral heuristic (0.5) alone puts the headline figure at >= 50%.
            # Left unchanged here; confirm this weighting is intended.
            combined = min(model_ai + heuristic_ai, 1.0)

            st.subheader(f"⚖️ AI Likelihood: {combined*100:.1f}%")
            st.write(f"🤖 Model: {model_ai*100:.1f}%")
            st.write(f"🛠️ Heuristic: {heuristic_ai*100:.1f}%")