dejanseo committed
Commit 4db9ce2 · verified · 1 Parent(s): 0598948

Upload 3 files

Files changed (3):
  1. .streamlit/config.toml +7 -0
  2. app.py +203 -0
  3. requirements.txt +5 -0
.streamlit/config.toml ADDED
@@ -0,0 +1,7 @@
+ [theme]
+ base = "light"
+ primaryColor = "#4a90e2"
+ backgroundColor = "#ffffff"
+ secondaryBackgroundColor = "#f0f2f6"
+ textColor = "#000000"
+ font = "roboto"
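
Note: Streamlit's built-in `theme.font` option accepts only `"sans serif"`, `"serif"`, or `"monospace"`, so `font = "roboto"` is most likely ignored and falls back to the default. app.py below compensates by loading Roboto through injected CSS.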
app.py ADDED
@@ -0,0 +1,203 @@
+ import streamlit as st
+ import torch
+ import torch.nn.functional as F
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ import re
+ import logging  # Optional: Add logging for better debugging
+
+ # Set up logging (optional but helpful)
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Set the page configuration
+ st.set_page_config(
+     page_title="AI Article Detection by DEJAN",
+     page_icon="🧠",
+     layout="wide"
+ )
+
+ # Logo as provided
+ st.logo(
+     image="https://dejan.ai/wp-content/uploads/2024/02/dejan-300x103.png",
+     link="https://dejan.ai/",
+     # size="large"  # 'size' is not a valid argument for st.logo as of Streamlit 1.34 - remove or adjust if needed
+ )
+
+ # Font styling
+ st.markdown("""
+ <link href="https://fonts.googleapis.com/css2?family=Roboto&display=swap" rel="stylesheet">
+ <style>
+ html, body, [class*="css"] {
+     font-family: 'Roboto', sans-serif;
+ }
+ </style>
+ """, unsafe_allow_html=True)
+
+ @st.cache_resource  # Cache the model and tokenizer to avoid reloading on every interaction
+ def load_model_and_tokenizer(model_name):
+     """Loads the model and tokenizer."""
+     logger.info(f"Loading tokenizer: {model_name}")
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+     # Determine device
+     device_type = "cuda" if torch.cuda.is_available() else "cpu"
+     # Use bfloat16 if available on CUDA for potential speedup/memory saving, else float32
+     dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
+     logger.info(f"Using device: {device_type} with dtype: {dtype}")
+
+     logger.info(f"Loading model: {model_name}")
+     # Load model onto CPU first, then move to target device
+     model = AutoModelForSequenceClassification.from_pretrained(
+         model_name,
+         torch_dtype=dtype  # Use the determined dtype
+         # Removed device_map="auto"
+     )
+     logger.info("Moving model to target device...")
+     model.to(torch.device(device_type))  # Move the entire model to the target device
+     model.eval()  # Set model to evaluation mode
+     logger.info("Model loaded successfully.")
+     return tokenizer, model, torch.device(device_type)
+
+ # Load model and tokenizer using the cached function
+ MODEL_NAME = "dejanseo/ai-detection-base"
+ try:
+     tokenizer, model, device = load_model_and_tokenizer(MODEL_NAME)
+ except Exception as e:
+     st.error(f"Error loading model: {e}")
+     logger.error(f"Failed to load model or tokenizer: {e}", exc_info=True)
+     st.stop()  # Stop execution if model loading fails
+
+
+ # Static settings
+ LABELS = ["AI Content", "Human Content"]
+ COLORS = ["#ffe5e5", "#e6ffe6"]  # light red, light green
+
+ # Regex-based sentence splitter (improved slightly for robustness)
+ def sent_tokenize(text):
+     # Split by '.', '!', '?' followed by space(s) or end of string
+     sentences = re.split(r'(?<=[.!?])\s+', text.strip())
+     # Filter out empty strings that might result from splitting
+     return [s for s in sentences if s]
+
+ def split_into_chunks(text, tokenizer, max_length=512):
+     sentences = sent_tokenize(text)
+     if not sentences:
+         return []  # Handle empty input after tokenization
+
+     chunks, current_chunk_sentences, current_len = [], [], 0
+     max_tokens = max_length - 2  # Account for [CLS] and [SEP] tokens
+
+     for sent in sentences:
+         # Use tokenizer.encode to get accurate token count (more reliable than tokenize)
+         token_ids = tokenizer.encode(sent, add_special_tokens=False)
+         token_len = len(token_ids)
+
+         if token_len > max_tokens:
+             # Sentence is too long even by itself, handle appropriately
+             # Option 1: Truncate the sentence (simplest)
+             logger.warning(f"Sentence truncated as it exceeds max_length: '{sent[:100]}...'")
+             truncated_sent = tokenizer.decode(token_ids[:max_tokens])
+             # If there was a previous chunk, add it first
+             if current_chunk_sentences:
+                 chunks.append(" ".join(current_chunk_sentences))
+             chunks.append(truncated_sent)  # Add the single truncated sentence as its own chunk
+             current_chunk_sentences, current_len = [], 0  # Reset chunk
+             continue  # Move to the next sentence
+
+         if current_len + token_len <= max_tokens:
+             current_chunk_sentences.append(sent)
+             current_len += token_len
+         else:
+             # Current chunk is full, finalize it
+             if current_chunk_sentences:
+                 chunks.append(" ".join(current_chunk_sentences))
+             # Start a new chunk with the current sentence
+             current_chunk_sentences = [sent]
+             current_len = token_len
+
+     # Add the last remaining chunk
+     if current_chunk_sentences:
+         chunks.append(" ".join(current_chunk_sentences))
+
+     return chunks
+
+ # --- UI ---
+ st.title("AI Article Detection")
+ text = st.text_area("Enter text to classify", height=150, placeholder="Paste your text here...")
+
+ if st.button("Classify", type="primary"):
+     if not text or not text.strip():
+         st.warning("Please enter some text.")
+     else:
+         with st.spinner("Analyzing... Please wait."):
+             try:
+                 # Split text using the tokenizer reference
+                 chunks = split_into_chunks(text, tokenizer, max_length=model.config.max_position_embeddings)
+                 logger.info(f"Split text into {len(chunks)} chunks.")
+
+                 if not chunks:
+                     st.warning("Could not process the input text (perhaps it's too short or contains only delimiters?).")
+                     st.stop()
+
+                 # Tokenize chunks and move tensors to the correct device
+                 inputs = tokenizer(
+                     chunks,
+                     return_tensors="pt",
+                     padding=True,  # Pad sequences to the longest in the batch
+                     truncation=True,  # Truncate sequences longer than max_length
+                     max_length=model.config.max_position_embeddings  # Use model's max length
+                 ).to(device)  # Move inputs to the same device as the model
+
+                 # Perform inference
+                 with torch.no_grad():
+                     outputs = model(**inputs)
+                     logits = outputs.logits
+                     # Ensure probabilities are calculated on CPU if needed for aggregation later
+                     probs = F.softmax(logits, dim=-1).cpu()  # Move probs to CPU
+                     preds = torch.argmax(probs, dim=-1)  # Argmax on CPU probabilities
+
+                 # Process results
+                 chunk_results = []
+                 for i, chunk in enumerate(chunks):
+                     pred_index = preds[i].item()  # Get prediction index for this chunk
+                     chunk_results.append({
+                         "text": chunk,
+                         "label": LABELS[pred_index],
+                         "color": COLORS[pred_index],
+                         "conf": probs[i, pred_index].item() * 100,  # Get confidence for the predicted class
+                     })
+
+                 # Calculate overall prediction based on average probability across chunks
+                 if probs.numel() > 0:  # Check if probs tensor is not empty
+                     avg_probs = torch.mean(probs, dim=0)  # Average probabilities across the batch dimension
+                     final_class_index = torch.argmax(avg_probs).item()
+                     final_label = LABELS[final_class_index]
+                     final_conf = avg_probs[final_class_index].item() * 100
+
+                     # Display final prediction
+                     st.subheader("📊 Final Prediction")
+                     st.markdown(
+                         f"<div style='background-color:{COLORS[final_class_index]}; padding:1rem; border-radius:0.5rem; border: 1px solid #ccc;'>"
+                         f"Based on the analysis, the text is most likely: <b>{final_label}</b> (Confidence: {final_conf:.1f}%)</div>",
+                         unsafe_allow_html=True
+                     )
+                 else:
+                     st.warning("Could not generate predictions for the provided text.")
+
+
+                 # Display per-chunk predictions in an expander
+                 with st.expander("See per-chunk predictions and confidence"):
+                     if chunk_results:
+                         for result in chunk_results:
+                             st.markdown(
+                                 f"<div title='Confidence: {result['conf']:.1f}%' "
+                                 f"style='background-color:{result['color']}; padding:0.75rem; margin-bottom:0.5rem; border-radius:0.5rem; border: 1px solid #ddd;'>"
+                                 f"<i>({result['label']} - {result['conf']:.1f}%)</i><br>{result['text']}</div>",
+                                 unsafe_allow_html=True
+                             )
+                     else:
+                         st.write("No chunk predictions were generated.")
+
+             except Exception as e:
+                 st.error(f"An error occurred during analysis: {e}")
+                 logger.error(f"Analysis failed: {e}", exc_info=True)
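
The sentence splitter drives the chunking logic above, and it can be exercised outside Streamlit. The sketch below is not part of the commit: it duplicates the one-line regex from app.py and reuses the `dejanseo/ai-detection-base` tokenizer named above; once `split_into_chunks` is copied over as well, the same driver can exercise the full chunker.

```python
import re
from transformers import AutoTokenizer

def sent_tokenize(text):
    # Same regex splitter as app.py: split after '.', '!', '?' followed by whitespace.
    return [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]

tokenizer = AutoTokenizer.from_pretrained("dejanseo/ai-detection-base")

sample = "Short opener. A second sentence follows! And a third one? The end."
for i, sent in enumerate(sent_tokenize(sample)):
    # Per-sentence token counts are what the chunker sums against max_length - 2,
    # the budget it reserves for the [CLS] and [SEP] special tokens.
    n = len(tokenizer.encode(sent, add_special_tokens=False))
    print(f"sentence {i}: {n} tokens -> {sent!r}")
```

The document-level verdict is the argmax of the mean of the per-chunk softmax distributions, so high-confidence chunks outweigh borderline ones, unlike a simple majority vote. A worked example with hypothetical probabilities, in the same class order as `LABELS`:

```python
import torch

# Hypothetical softmax outputs for three chunks and two classes,
# ordered as LABELS = ["AI Content", "Human Content"] in app.py.
probs = torch.tensor([[0.90, 0.10],
                      [0.60, 0.40],
                      [0.20, 0.80]])

avg_probs = probs.mean(dim=0)            # tensor([0.5667, 0.4333])
final_index = avg_probs.argmax().item()  # 0 -> "AI Content"
print(f"{avg_probs[final_index].item() * 100:.1f}%")  # 56.7% confidence
```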
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ streamlit
+ torch
+ transformers
+ nltk
+ accelerate
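
Two of these dependencies appear to be leftovers: app.py ships its own regex-based `sent_tokenize` instead of using `nltk`, and `accelerate` is only required for `device_map`-style loading, which this commit explicitly removes. With the files in place, the app should run locally with `pip install -r requirements.txt` followed by `streamlit run app.py`.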