Spaces:
Running
Running
File size: 5,093 Bytes
d73c320 64170a0 4db9ce2 8582a8e 964a94b 8582a8e d6bd636 94a3584 64170a0 d73c320 64170a0 d73c320 8582a8e d73c320 8582a8e 4db9ce2 bd98692 4db9ce2 8582a8e 4db9ce2 68d1553 8582a8e 68d1553 8582a8e 68d1553 4db9ce2 1b73e0f 4db9ce2 8582a8e 68d1553 4db9ce2 8582a8e 4db9ce2 8582a8e 4db9ce2 8582a8e 4db9ce2 8582a8e 964a94b f6a4aa5 964a94b 8582a8e c2f4b21 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
import os
import json
import ast
import streamlit as st
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import math
import logging
import pandas as pd
# Configure the page: browser-tab title, icon, and the "wide" layout.
st.set_page_config(
page_title="AI Article Detection by DEJAN",
page_icon="🧠",
layout="wide"
)
# Show the DEJAN logo image; the logo links to the DEJAN website.
st.logo(
image="https://dejan.ai/wp-content/uploads/2024/02/dejan-300x103.png",
link="https://dejan.ai/",
)
# --- Load heuristic weights from environment secrets, with JSON→Python fallback ---
@st.cache_resource
def load_heuristic_weights():
    """Load the two heuristic word-weight tables from environment variables.

    Reads ``AI_WEIGHTS_JSON`` and ``OG_WEIGHTS_JSON``. Each value is parsed
    as JSON first; if that fails it is parsed as a Python literal with
    ``ast.literal_eval`` (which only accepts literals and never executes
    code). The function is wrapped in ``st.cache_resource``.

    Returns:
        tuple: ``(ai, og)`` -- the parsed ``AI_WEIGHTS_JSON`` and
        ``OG_WEIGHTS_JSON`` dictionaries, in that order.

    Raises:
        KeyError: if either environment variable is not set.
        ValueError: if a value is neither valid JSON nor a Python literal.
        TypeError: if a parsed value is not a ``dict``.
    """
    def _load(env_key):
        # Parse one environment variable into a dict of weights.
        try:
            raw = os.environ[env_key]
        except KeyError:
            raise KeyError(
                f"Required environment variable {env_key!r} is not set"
            ) from None

        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            try:
                parsed = ast.literal_eval(raw)
            except (ValueError, SyntaxError, TypeError) as err:
                # Deliberately do not echo the raw value: it comes from a secret.
                raise ValueError(
                    f"Environment variable {env_key!r} is neither valid JSON "
                    "nor a Python literal"
                ) from err

        # Callers look words up with .get(), so anything but a dict would
        # only fail later, far from the real cause.
        if not isinstance(parsed, dict):
            raise TypeError(
                f"Environment variable {env_key!r} must decode to a dict, "
                f"got {type(parsed).__name__}"
            )
        return parsed

    ai = _load("AI_WEIGHTS_JSON")
    og = _load("OG_WEIGHTS_JSON")
    return ai, og
# Word-weight tables consulted by the heuristic scorer; loaded when the module runs.
AI_WEIGHTS, OG_WEIGHTS = load_heuristic_weights()
# Scale factor multiplied into the net heuristic score before the sigmoid.
SIGMOID_K = 0.5
# Runs of two or more ASCII lowercase letters delimited by word boundaries.
_WORD_PATTERN = re.compile(r'\b[a-z]{2,}\b')


def tokenize(text):
    """Return the lowercase words of *text* as a list.

    The text is lowercased first; a word is a run of at least two letters
    ``a``-``z`` bounded on both sides by a regex word boundary, so runs
    touching digits or underscores are not returned.
    """
    lowered = text.lower()
    return _WORD_PATTERN.findall(lowered)
def classify_text_likelihood(text: str) -> float:
    """Score how AI-like *text* looks according to the heuristic word weights.

    Every token produced by ``tokenize`` is looked up in ``AI_WEIGHTS`` and
    ``OG_WEIGHTS`` (missing words count as 0). The summed OG weight is
    subtracted from the summed AI weight and the difference, scaled by
    ``SIGMOID_K``, is passed through a logistic sigmoid.

    Returns:
        A float between 0 and 1; higher means the AI weights dominate.
        Returns the neutral value 0.5 when the text yields no tokens or when
        no token has a non-zero weight in either table.
    """
    tokens = tokenize(text)
    if not tokens:
        return 0.5

    ai_score = og_score = 0
    matched = False
    for t in tokens:
        aw = AI_WEIGHTS.get(t, 0)
        ow = OG_WEIGHTS.get(t, 0)
        if aw or ow:
            matched = True
            ai_score += aw
            og_score += ow

    if not matched:
        return 0.5

    z = SIGMOID_K * (ai_score - og_score)
    # math.exp raises OverflowError for large positive arguments, which the
    # naive 1 / (1 + exp(-z)) hits when z is strongly negative. Pick the
    # algebraically equivalent form whose exponent is never positive.
    if z >= 0:
        return 1 / (1 + math.exp(-z))
    exp_z = math.exp(z)
    return exp_z / (1 + exp_z)
# --- Logging & Streamlit setup ---
# Configure root logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Inject a Google Fonts stylesheet link and a CSS rule that applies the
# Roboto font family; unsafe_allow_html is required for the raw HTML to render.
st.markdown("""
<link href="https://fonts.googleapis.com/css2?family=Roboto&display=swap" rel="stylesheet">
<style>
html, body, [class*="css"] {
font-family: 'Roboto', sans-serif;
}
</style>
""", unsafe_allow_html=True)
@st.cache_resource
def load_model_and_tokenizer(model_name):
    """Load the tokenizer and sequence-classification model for *model_name*.

    The model is placed on CUDA when available, otherwise on the CPU. It is
    loaded in bfloat16 only when running on CUDA and the device reports
    bf16 support; in every other case float32 is used. The model is switched
    to eval mode before being returned.

    Returns:
        tuple: ``(tokenizer, model, device)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # Only query bf16 support when actually on CUDA.
    if device.type == "cuda" and torch.cuda.is_bf16_supported():
        dtype = torch.bfloat16
    else:
        dtype = torch.float32

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        torch_dtype=dtype,
    )
    model.to(device)
    model.eval()
    return tokenizer, model, device
# Model identifier handed to load_model_and_tokenizer.
MODEL_NAME = "dejanseo/ai-cop"

try:
    tokenizer, model, device = load_model_and_tokenizer(MODEL_NAME)
except Exception as e:
    # Top-level boundary: report the failure in the UI, log it with the
    # traceback, and halt this script run since nothing below can work.
    st.error(f"Error loading model: {e}")
    logger.exception(f"Failed to load model: {e}")
    st.stop()
def sent_tokenize(text):
    """Split *text* into sentences and return them as a list of strings.

    The text is stripped, then split on whitespace that directly follows a
    ``.``, ``!`` or ``?``; the punctuation stays attached to its sentence.
    Empty pieces are dropped, so blank input yields an empty list.
    """
    stripped = text.strip()
    pieces = re.split(r'(?<=[\.!?])\s+', stripped)
    return list(filter(None, pieces))
# Page heading and the free-text input whose contents are classified below.
st.title("AI Article Detection")
text = st.text_area("Enter text to classify", height=200, placeholder="Paste your text here…")
# Run the classifier when the user presses the button.
if st.button("Classify", type="primary"):
    if not text.strip():
        st.warning("Please enter some text.")
    else:
        with st.spinner("Analyzing…"):
            sentences = sent_tokenize(text)
            if not sentences:
                st.warning("No sentences detected.")
                st.stop()

            # Tokenize all sentences as one padded batch, truncated to the
            # model's configured maximum number of positions.
            encoded = tokenizer(
                sentences,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=model.config.max_position_embeddings,
            ).to(device)

            with torch.no_grad():
                logits = model(**encoded).logits

            probs = F.softmax(logits, dim=-1).cpu()
            preds = torch.argmax(probs, dim=-1).cpu()

        # Build one table row per sentence: predicted label and the
        # probability the model assigned to that label.
        sentences_data = []
        for sentence, pred, sentence_probs in zip(sentences, preds, probs):
            class_idx = pred.item()
            sentences_data.append({
                "sentence": sentence,
                # Class index 0 is reported as "AI", any other index as "Human".
                "classification": "AI" if class_idx == 0 else "Human",
                "confidence": sentence_probs[class_idx].item(),
            })

        # Render the per-sentence table with confidence shown as a progress bar.
        confidence_column = st.column_config.ProgressColumn(
            "Confidence",
            help="Model's confidence in the classification",
            format="%.2f",
            min_value=0,
            max_value=1,
        )
        column_config = {
            "sentence": st.column_config.TextColumn("Sentence"),
            "classification": st.column_config.TextColumn("Classification"),
            "confidence": confidence_column,
        }
        df = pd.DataFrame(sentences_data)
        st.dataframe(df, column_config=column_config, hide_index=True)

        # Document-level scores: mean class-0 probability over all sentences
        # from the model, plus the word-weight heuristic on the full text.
        avg = probs.mean(dim=0)
        model_ai = avg[0].item()
        heuristic_ai = classify_text_likelihood(text)
        # NOTE(review): the two scores are summed and capped at 1.0, so a
        # neutral heuristic result (0.5) already adds 50 percentage points --
        # confirm a sum rather than an average is intended.
        combined = min(model_ai + heuristic_ai, 1.0)

        st.subheader(f"⚖️ AI Likelihood: {combined*100:.1f}%")
        st.write(f"🤖 Model: {model_ai*100:.1f}%")
        st.write(f"🛠️ Heuristic: {heuristic_ai*100:.1f}%")
|