# Page residue captured together with the source (not program code): "Spaces: Running Running"
import os | |
import json | |
import ast | |
import streamlit as st | |
import torch | |
import torch.nn.functional as F | |
from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
import re | |
import math | |
import logging | |
import pandas as pd | |
# Page chrome: browser-tab title, tab icon and wide layout. These are the
# first Streamlit calls issued by this script.
st.set_page_config(page_title="AI Article Detection by DEJAN", page_icon="🧠", layout="wide")

# Logo image; the `link` argument points it at the DEJAN site.
st.logo(
    image="https://dejan.ai/wp-content/uploads/2024/02/dejan-300x103.png",
    link="https://dejan.ai/",
)
# --- Load heuristic weights from environment secrets, with JSON→Python fallback ---
def load_heuristic_weights():
    """Read the two word-weight tables from environment variables.

    ``AI_WEIGHTS_JSON`` and ``OG_WEIGHTS_JSON`` are each parsed as JSON first.
    When that fails, the value is parsed as a Python literal with
    ``ast.literal_eval`` (which tolerates single-quoted strings and trailing
    commas that strict JSON rejects).

    Returns:
        A ``(ai_weights, og_weights)`` tuple of dicts, in that order.

    Raises:
        KeyError: if either environment variable is not set.
        ValueError: if a value is neither valid JSON nor a Python literal.
        TypeError: if a value parses to something other than a dict.
    """

    def _load(env_key):
        """Parse one environment variable into a dict."""
        try:
            raw = os.environ[env_key]
        except KeyError:
            raise KeyError(f"environment variable {env_key} is not set") from None

        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            try:
                parsed = ast.literal_eval(raw)
            except (ValueError, SyntaxError):
                # The raw value comes from a secret, so it is deliberately
                # kept out of the message and the original exception (which
                # can quote the offending text) is not chained.
                raise ValueError(
                    f"{env_key} is neither valid JSON nor a Python literal"
                ) from None

        # Callers use dict lookups on the result; reject lists, numbers, etc.
        # here instead of failing later with an unrelated AttributeError.
        if not isinstance(parsed, dict):
            raise TypeError(
                f"{env_key} must decode to a dict, got {type(parsed).__name__}"
            )
        return parsed

    ai = _load("AI_WEIGHTS_JSON")
    og = _load("OG_WEIGHTS_JSON")
    return ai, og
# Word-weight tables used by classify_text_likelihood, loaded at module level.
(AI_WEIGHTS, OG_WEIGHTS) = load_heuristic_weights()

# Multiplier applied to the net heuristic score before the logistic squash.
SIGMOID_K = 0.5
def tokenize(text):
    """Return the lowercase a-z words of *text* that are at least two letters long.

    The text is lowercased first; runs of letters that touch a digit or
    underscore are not matched because no word boundary separates them.
    """
    lowered = text.lower()
    word_pattern = re.compile(r'\b[a-z]{2,}\b')
    return word_pattern.findall(lowered)
def classify_text_likelihood(text: str) -> float:
    """Score *text* with the word-weight heuristic.

    Every token produced by ``tokenize`` is looked up in ``AI_WEIGHTS`` and
    ``OG_WEIGHTS``. The summed AI weight minus the summed OG weight is scaled
    by ``SIGMOID_K`` and passed through a logistic function.

    Returns:
        The logistic of the scaled net score (above 0.5 when the AI weights
        dominate, below 0.5 when the OG weights dominate). Exactly 0.5 when
        the text yields no tokens or no token has a non-zero weight in
        either table.
    """
    tokens = tokenize(text)
    if not tokens:
        return 0.5

    ai_score = og_score = 0
    matched = False
    for t in tokens:
        aw = AI_WEIGHTS.get(t, 0)
        ow = OG_WEIGHTS.get(t, 0)
        if aw or ow:
            matched = True
            ai_score += aw
            og_score += ow
    if not matched:
        return 0.5

    z = SIGMOID_K * (ai_score - og_score)
    # Numerically stable logistic: only ever exponentiate a non-positive
    # number. The naive 1 / (1 + exp(-z)) raises OverflowError once -z
    # exceeds roughly 709, i.e. for strongly "original"-leaning text.
    if z >= 0:
        return 1 / (1 + math.exp(-z))
    ez = math.exp(z)
    return ez / (1 + ez)
# --- Logging & Streamlit setup ---
# Module logger plus root logging at INFO level.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Inject the Roboto web font and apply it to the page; unsafe_allow_html is
# needed so the <link>/<style> tags are emitted as HTML rather than as text.
st.markdown(
    """
<link href="https://fonts.googleapis.com/css2?family=Roboto&display=swap" rel="stylesheet">
<style>
html, body, [class*="css"] {
font-family: 'Roboto', sans-serif;
}
</style>
""",
    unsafe_allow_html=True,
)
@st.cache_resource
def load_model_and_tokenizer(model_name):
    """Load the tokenizer and sequence-classification model for *model_name*.

    Streamlit re-executes this script on every widget interaction. The
    ``st.cache_resource`` decorator keeps the loaded objects between reruns
    so the model is not loaded again on each button click.

    Args:
        model_name: Identifier passed to both ``from_pretrained`` calls.

    Returns:
        A ``(tokenizer, model, device)`` tuple. The model has been moved to
        ``device`` and switched to eval mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Prefer CUDA when present; use bfloat16 only on a GPU that supports it,
    # otherwise stay in float32.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    use_bf16 = device.type == "cuda" and torch.cuda.is_bf16_supported()
    dtype = torch.bfloat16 if use_bf16 else torch.float32

    model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype=dtype)
    model.to(device).eval()
    return tokenizer, model, device
# Model identifier handed to load_model_and_tokenizer.
MODEL_NAME = "dejanseo/ai-cop"

# Load once at script level. On any failure: show the error in the page,
# log it with the traceback, and halt the script run.
try:
    tokenizer, model, device = load_model_and_tokenizer(MODEL_NAME)
except Exception as e:
    st.error(f"Error loading model: {e}")
    logger.exception(f"Failed to load model: {e}")
    st.stop()
def sent_tokenize(text):
    """Split *text* into sentences at whitespace that follows ``.``, ``!`` or ``?``.

    Leading and trailing whitespace is stripped first, and empty pieces are
    dropped, so blank input yields an empty list.
    """
    pieces = re.split(r'(?<=[\.!?])\s+', text.strip())
    return list(filter(None, pieces))
st.title("AI Article Detection")
text = st.text_area("Enter text to classify", height=200, placeholder="Paste your text here…")

if st.button("Classify", type="primary"):
    if text.strip():
        with st.spinner("Analyzing…"):
            sentences = sent_tokenize(text)
            if not sentences:
                st.warning("No sentences detected.")
                st.stop()

            # All sentences go through the model as one padded batch; each
            # sentence is truncated to the model's configured position limit.
            inputs = tokenizer(
                sentences,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=model.config.max_position_embeddings,
            ).to(device)
            with torch.no_grad():
                logits = model(**inputs).logits
                probs = F.softmax(logits, dim=-1).cpu()
                preds = torch.argmax(probs, dim=-1)

            # One table row per sentence. Class index 0 is reported as "AI",
            # any other index as "Human"; confidence is the probability of
            # the predicted class.
            sentences_data = [
                {
                    "sentence": sentence,
                    "classification": "AI" if cls == 0 else "Human",
                    "confidence": probs[row, cls].item(),
                }
                for row, (sentence, cls) in enumerate(zip(sentences, preds.tolist()))
            ]

            # Render the table, drawing confidence as a 0..1 progress bar.
            column_config = {
                "sentence": st.column_config.TextColumn("Sentence"),
                "classification": st.column_config.TextColumn("Classification"),
                "confidence": st.column_config.ProgressColumn(
                    "Confidence",
                    help="Model's confidence in the classification",
                    format="%.2f",
                    min_value=0,
                    max_value=1,
                ),
            }
            df = pd.DataFrame(sentences_data)
            st.dataframe(df, column_config=column_config, hide_index=True)

            # Document-level numbers: mean class-0 probability across the
            # sentences, plus the word-weight heuristic on the full text.
            model_ai = probs.mean(dim=0)[0].item()
            heuristic_ai = classify_text_likelihood(text)
            # NOTE(review): the two scores are summed and capped at 1.0, and
            # the heuristic returns 0.5 when it has no evidence, so the
            # combined figure starts at model_ai + 0.5 in that case —
            # confirm this is the intended combination.
            combined = min(model_ai + heuristic_ai, 1.0)

            st.subheader(f"⚖️ AI Likelihood: {combined*100:.1f}%")
            st.write(f"🤖 Model: {model_ai*100:.1f}%")
            st.write(f"🛠️ Heuristic: {heuristic_ai*100:.1f}%")
    else:
        st.warning("Please enter some text.")