import os
import json
import ast
import streamlit as st
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import math
import logging
import pandas as pd
st.set_page_config(
    page_title="AI Article Detection by DEJAN",
    page_icon="🧠",
    layout="wide"
)

st.logo(
    image="https://dejan.ai/wp-content/uploads/2024/02/dejan-300x103.png",
    link="https://dejan.ai/",
)
# --- Load heuristic weights from environment secrets, with JSON→Python fallback ---
@st.cache_resource
def load_heuristic_weights():
    def _load(env_key):
        raw = os.environ[env_key]
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            # Secrets may be stored as Python dict literals rather than strict JSON
            return ast.literal_eval(raw)
    ai = _load("AI_WEIGHTS_JSON")
    og = _load("OG_WEIGHTS_JSON")
    return ai, og

AI_WEIGHTS, OG_WEIGHTS = load_heuristic_weights()
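# Illustrative only: each secret is expected to map lowercase tokens to numeric weights,
# supplied either as JSON or as a Python dict literal. The tokens and values below are
# made-up examples, not the real weights:
#   AI_WEIGHTS_JSON = '{"delve": 1.4, "tapestry": 0.9}'
#   OG_WEIGHTS_JSON = "{'reckon': 1.1, 'gonna': 0.7}"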
SIGMOID_K = 0.5
def tokenize(text):
    return re.findall(r'\b[a-z]{2,}\b', text.lower())
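
# For example, per the regex above: tokenize("It's GPT-4 era!") -> ["it", "gpt", "era"]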
def classify_text_likelihood(text: str) -> float:
    tokens = tokenize(text)
    if not tokens:
        return 0.5
    ai_score = og_score = matched = 0
    for t in tokens:
        aw = AI_WEIGHTS.get(t, 0)
        ow = OG_WEIGHTS.get(t, 0)
        if aw or ow:
            matched += 1
            ai_score += aw
            og_score += ow
    if matched == 0:
        return 0.5
    net = ai_score - og_score
    return 1 / (1 + math.exp(-SIGMOID_K * net))
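
# Rough expected behaviour (exact numbers depend on the secret weights): a text with
# no weighted tokens returns a neutral 0.5; net positive AI evidence pushes the sigmoid
# towards 1.0, while net original-content evidence pushes it towards 0.0.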
# --- Logging & Streamlit setup ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
st.markdown("""
<link href="https://fonts.googleapis.com/css2?family=Roboto&display=swap" rel="stylesheet">
<style>
    html, body, [class*="css"] {
        font-family: 'Roboto', sans-serif;
    }
</style>
""", unsafe_allow_html=True)
@st.cache_resource
def load_model_and_tokenizer(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Use bfloat16 only on GPUs that support it; otherwise fall back to float32
    dtype = torch.bfloat16 if (device.type == "cuda" and torch.cuda.is_bf16_supported()) else torch.float32
    model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype=dtype)
    model.to(device).eval()
    return tokenizer, model, device
MODEL_NAME = "dejanseo/ai-cop"
try:
    tokenizer, model, device = load_model_and_tokenizer(MODEL_NAME)
except Exception as e:
    st.error(f"Error loading model: {e}")
    logger.error(f"Failed to load model: {e}", exc_info=True)
    st.stop()
def sent_tokenize(text):
    # Split on whitespace that follows sentence-ending punctuation
    return [s for s in re.split(r'(?<=[\.!?])\s+', text.strip()) if s]
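
# Quick illustration of the splitter above:
# sent_tokenize("First sentence. Second one!") -> ["First sentence.", "Second one!"]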
st.title("AI Article Detection")
text = st.text_area("Enter text to classify", height=200, placeholder="Paste your text here…")
if st.button("Classify", type="primary"):
    if not text.strip():
        st.warning("Please enter some text.")
    else:
        with st.spinner("Analyzing…"):
            sentences = sent_tokenize(text)
            if not sentences:
                st.warning("No sentences detected.")
                st.stop()

            # Batch-encode all sentences at once, capped at the model's positional limit
            inputs = tokenizer(
                sentences,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=model.config.max_position_embeddings
            ).to(device)

            with torch.no_grad():
                logits = model(**inputs).logits
            probs = F.softmax(logits, dim=-1).cpu()
            preds = torch.argmax(probs, dim=-1)

            # Create dataframe for sentences (class index 0 = AI, 1 = Human)
            sentences_data = []
            for i, s in enumerate(sentences):
                p = preds[i].item()
                conf = probs[i, p].item()
                label = "AI" if p == 0 else "Human"
                sentences_data.append({
                    "sentence": s,
                    "classification": label,
                    "confidence": conf
                })

            # Display as dataframe with progress column
            df = pd.DataFrame(sentences_data)
            st.dataframe(
                df,
                column_config={
                    "sentence": st.column_config.TextColumn("Sentence"),
                    "classification": st.column_config.TextColumn("Classification"),
                    "confidence": st.column_config.ProgressColumn(
                        "Confidence",
                        help="Model's confidence in the classification",
                        format="%.2f",
                        min_value=0,
                        max_value=1,
                    ),
                },
                hide_index=True,
            )

            # Document-level scores: mean per-sentence AI probability from the model,
            # the token-weight heuristic over the full text, and their sum capped at 1.0
            avg = torch.mean(probs, dim=0)
            model_ai = avg[0].item()
            heuristic_ai = classify_text_likelihood(text)
            combined = min(model_ai + heuristic_ai, 1.0)

            st.subheader(f"⚖️ AI Likelihood: {combined*100:.1f}%")
            st.write(f"🤖 Model: {model_ai*100:.1f}%")
            st.write(f"🛠️ Heuristic: {heuristic_ai*100:.1f}%")