Spaces:
Running
Running
File size: 5,166 Bytes
d73c320 64170a0 4db9ce2 8582a8e d6bd636 64170a0 d73c320 64170a0 d73c320 8582a8e d73c320 8582a8e 4db9ce2 d6bd636 4db9ce2 bd98692 4db9ce2 8582a8e 4db9ce2 68d1553 8582a8e 68d1553 8582a8e 68d1553 4db9ce2 4f473c9 4db9ce2 8582a8e 68d1553 4db9ce2 8582a8e 4db9ce2 8582a8e 4db9ce2 8582a8e 4db9ce2 8582a8e d73c320 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
import os
import json
import ast
import streamlit as st
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import math
import logging
# Page-level Streamlit settings: browser-tab title, tab icon, and wide layout.
st.set_page_config(
    page_title="AI Article Detection by DEJAN",
    page_icon="🧠",
    layout="wide",
)
# --- Load heuristic weights from environment secrets, with JSON→Python fallback ---
@st.cache_resource
def load_heuristic_weights():
    """Read the two heuristic weight tables from environment variables.

    The values of ``AI_WEIGHTS_JSON`` and ``OG_WEIGHTS_JSON`` are parsed as
    JSON first; if that fails with ``json.JSONDecodeError`` the raw string is
    parsed as a Python literal via ``ast.literal_eval`` instead.

    Returns:
        A ``(ai_weights, og_weights)`` tuple of the parsed values.

    Raises:
        KeyError: if either environment variable is not set.
    """

    def _parse_secret(name):
        # Parse one environment variable, preferring JSON over Python-literal syntax.
        payload = os.environ[name]
        try:
            parsed = json.loads(payload)
        except json.JSONDecodeError:
            parsed = ast.literal_eval(payload)
        return parsed

    # Tuple elements are evaluated left to right: AI table first, then OG table.
    return _parse_secret("AI_WEIGHTS_JSON"), _parse_secret("OG_WEIGHTS_JSON")
# Multiplier applied to the net heuristic score before it is passed through the
# sigmoid in classify_text_likelihood (larger values give a steeper curve).
SIGMOID_K = 0.5

# Per-word weight tables used by the heuristic scorer and the word highlighter.
# NOTE(review): "OG" presumably stands for "original" (human-written) — confirm.
AI_WEIGHTS, OG_WEIGHTS = load_heuristic_weights()
def tokenize(text):
    """Return every lowercase run of two or more ASCII letters found in *text*.

    The text is lowercased first, so matching is effectively case-insensitive.
    Runs that touch digits or underscores are not matched, because no word
    boundary exists between them and the letters.
    """
    lowered = text.lower()
    word_pattern = re.compile(r'\b[a-z]{2,}\b')
    return word_pattern.findall(lowered)
def classify_text_likelihood(text: str) -> float:
    """Estimate how likely *text* is AI-written from per-word heuristic weights.

    Each token produced by ``tokenize`` is looked up in ``AI_WEIGHTS`` and
    ``OG_WEIGHTS``; the summed weights are compared and the difference is
    squashed with a logistic function scaled by ``SIGMOID_K``.

    Args:
        text: The text to score.

    Returns:
        A value in [0, 1]. ``0.5`` is returned when the text has no tokens or
        when no token carries a non-zero weight in either table.
    """
    tokens = tokenize(text)
    if not tokens:
        return 0.5

    ai_score = og_score = 0
    matched = False
    for token in tokens:
        ai_weight = AI_WEIGHTS.get(token, 0)
        og_weight = OG_WEIGHTS.get(token, 0)
        if ai_weight or og_weight:
            matched = True
            ai_score += ai_weight
            og_score += og_weight

    if not matched:
        return 0.5

    # Numerically stable sigmoid: math.exp() raises OverflowError for large
    # positive arguments, which the naive 1 / (1 + exp(-z)) hits when z is very
    # negative (long text dominated by OG-weighted words). Only ever
    # exponentiate a non-positive number.
    z = SIGMOID_K * (ai_score - og_score)
    if z >= 0:
        return 1 / (1 + math.exp(-z))
    exp_z = math.exp(z)
    return exp_z / (1 + exp_z)
def highlight_heuristic_words(text: str) -> str:
    """Return *text* as HTML with heuristic words underlined.

    Words found in ``AI_WEIGHTS`` get a dark-red underline; words found only in
    ``OG_WEIGHTS`` get a dark-green one. Lookup uses the lowercased word, and
    the original casing is kept in the output.

    Args:
        text: Raw (untrusted) text to decorate.

    Returns:
        An HTML fragment. All text taken from *text* is HTML-escaped, so it is
        safe to embed in markup rendered with ``unsafe_allow_html=True``.
    """
    import html  # local import keeps this block self-contained

    out = []
    # Split case-insensitively so capitalised words ("Delve", "THE") are
    # isolated too; scoring lowercases the text, so highlighting must agree.
    for part in re.split(r'(\b[A-Za-z]{2,}\b)', text):
        lower = part.lower()
        # Escape user-supplied text so it cannot inject tags or attributes.
        safe = html.escape(part)
        if lower in AI_WEIGHTS:
            color = "darkred"
        elif lower in OG_WEIGHTS:
            color = "darkgreen"
        else:
            out.append(safe)
            continue
        out.append(
            f"<span style='text-decoration: underline; "
            f"text-decoration-color: {color}; text-decoration-thickness: 2px;'>"
            f"{safe}</span>"
        )
    return ''.join(out)
# --- Logging & Streamlit setup ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Inject a stylesheet link and a CSS rule that switch the page font to Roboto.
st.markdown(
    """
<link href="https://fonts.googleapis.com/css2?family=Roboto&display=swap" rel="stylesheet">
<style>
html, body, [class*="css"] {
font-family: 'Roboto', sans-serif;
}
</style>
""",
    unsafe_allow_html=True,
)
@st.cache_resource
def load_model_and_tokenizer(model_name):
    """Load the tokenizer and sequence-classification model for *model_name*.

    The model is placed on CUDA when available, otherwise on the CPU. bfloat16
    weights are requested only when running on CUDA and the device reports
    bf16 support; float32 is used in every other case. The model is switched
    to eval mode before being returned.

    Args:
        model_name: Identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model, device)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device.type == "cuda" and torch.cuda.is_bf16_supported():
        dtype = torch.bfloat16
    else:
        dtype = torch.float32

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        torch_dtype=dtype,
    )
    model.to(device).eval()
    return tokenizer, model, device
# Identifier handed to load_model_and_tokenizer (and from there to from_pretrained).
MODEL_NAME = "dejanseo/ai-detection-small"

try:
    tokenizer, model, device = load_model_and_tokenizer(MODEL_NAME)
except Exception as e:
    # Show the failure in the UI, log it with the traceback, then halt the script.
    st.error(f"Error loading model: {e}")
    logger.exception(f"Failed to load model: {e}")
    st.stop()
def sent_tokenize(text):
    """Split *text* into sentences.

    The stripped text is cut at every run of whitespace that directly follows
    a ``.``, ``!`` or ``?``; the punctuation stays attached to the sentence
    before it. Empty pieces are dropped, so blank input yields an empty list.
    """
    pieces = re.split(r'(?<=[\.!?])\s+', text.strip())
    return list(filter(None, pieces))
# --- Page body: input box, per-sentence classification, summary scores ---
st.title("AI Article Detection")
text = st.text_area(
    "Enter text to classify",
    height=200,
    placeholder="Paste your text here…",
)

if st.button("Classify", type="primary"):
    if text.strip():
        with st.spinner("Analyzing…"):
            sentences = sent_tokenize(text)
            if not sentences:
                st.warning("No sentences detected.")
                st.stop()

            # Tokenize all sentences as one padded batch (one row per sentence).
            inputs = tokenizer(
                sentences,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=model.config.max_position_embeddings,
            ).to(device)

            with torch.no_grad():
                logits = model(**inputs).logits
                probs = F.softmax(logits, dim=-1).cpu()
            preds = torch.argmax(probs, dim=-1).cpu()

            # Build one tinted <span> per sentence. Class index 0 is tinted red
            # and any other class green; the predicted class's probability is
            # used as the background opacity.
            chunks = []
            for sentence, label, sentence_probs in zip(sentences, preds.tolist(), probs):
                inner = highlight_heuristic_words(sentence)
                if label == 0:
                    r, g = 255, 0
                else:
                    r, g = 0, 255
                alpha = sentence_probs[label].item()
                chunks.append(
                    f"<span style='background-color: rgba({r},{g},0,{alpha:.2f}); "
                    f"padding:2px; margin:0 2px; border-radius:3px;'>{inner}</span>"
                )
            st.markdown("".join(chunks), unsafe_allow_html=True)

            # Model score: mean probability of class index 0 across sentences.
            model_ai = probs.mean(dim=0)[0].item()
            heuristic_ai = classify_text_likelihood(text)
            # NOTE(review): the two likelihoods are summed and capped at 1.0.
            # Because the heuristic returns 0.5 when it has no signal, a neutral
            # heuristic alone lifts the combined value to at least 50% — confirm
            # this is intended (an average may have been meant).
            combined = min(model_ai + heuristic_ai, 1.0)

            st.subheader(f"🤖 Model AI Likelihood: {model_ai*100:.1f}%")
            st.subheader(f"🛠️ Heuristic AI Likelihood: {heuristic_ai*100:.1f}%")
            st.subheader(f"⚖️ Combined AI Likelihood: {combined*100:.1f}%")
    else:
        st.warning("Please enter some text.")
|