Spaces:

ApsidalSolid4
/

CITProjectAIDetector

Running

App Files Files Community

ApsidalSolid4 committed on Feb 17

Commit

7b9d8b2

verified ·

1 Parent(s): 1351283

Update app.py

Browse files

Files changed (1) hide show

app.py +206 -121

app.py CHANGED Viewed

@@ -1,31 +1,34 @@
 import torch
-import numpy as np
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import torch.nn.functional as F
 import spacy
 from typing import List, Dict
 import logging
 import os
-import gradio as gr
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Constants
 MAX_LENGTH = 512
 MODEL_NAME = "microsoft/deberta-v3-small"
 WINDOW_SIZE = 17
 WINDOW_OVERLAP = 2
 CONFIDENCE_THRESHOLD = 0.65
-class TextWindowProcessor:
     def __init__(self):
         try:
             self.nlp = spacy.load("en_core_web_sm")
         except OSError:
             logger.info("Downloading spacy model...")
-            spacy.cli.download("en_core_web_sm")
             self.nlp = spacy.load("en_core_web_sm")
         if 'sentencizer' not in self.nlp.pipe_names:
@@ -38,16 +41,29 @@ class TextWindowProcessor:
         doc = self.nlp(text)
         return [str(sent).strip() for sent in doc.sents]
     def create_centered_windows(self, sentences: List[str], window_size: int) -> tuple[List[str], List[List[int]]]:
         """Create windows centered around each sentence for detailed analysis."""
         windows = []
         window_sentence_indices = []
         for i in range(len(sentences)):
             half_window = window_size // 2
             start_idx = max(0, i - half_window)
             end_idx = min(len(sentences), i + half_window + 1)
             if start_idx == 0:
                 end_idx = min(len(sentences), window_size)
             elif end_idx == len(sentences):
@@ -59,60 +75,109 @@ class TextWindowProcessor:
         return windows, window_sentence_indices
-class TextClassifier:
     def __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.model_name = MODEL_NAME
         self.tokenizer = None
         self.model = None
-        self.processor = TextWindowProcessor()
-        self.initialize_model()
-    def initialize_model(self):
-        """Initialize the model and tokenizer."""
-        logger.info("Initializing model and tokenizer...")
-        from transformers import DebertaV2TokenizerFast
-        # Try to load tokenizer directly from the Hub
-        self.tokenizer = DebertaV2TokenizerFast.from_pretrained(
-            self.model_name,
-            model_max_length=MAX_LENGTH,
-            use_fast=False,
-            from_slow=True
-        )
-        # Initialize the model as before
         self.model = AutoModelForSequenceClassification.from_pretrained(
-            self.model_name,
             num_labels=2
         ).to(self.device)
-        # Your existing model loading code
-        model_path = "model_20250209_184929_acc1.0000.pt"
-        if os.path.exists(model_path):
-            logger.info(f"Loading custom model from {model_path}")
             checkpoint = torch.load(model_path, map_location=self.device)
             self.model.load_state_dict(checkpoint['model_state_dict'])
-        else:
-            logger.warning("Custom model file not found. Using base model.")
         self.model.eval()
-    def predict_with_sentence_scores(self, text: str) -> Dict:
-        """Predict with sentence-level granularity using overlapping windows."""
-        if not text.strip():
             return {
-                'sentence_predictions': [],
-                'highlighted_text': '',
-                'full_text': '',
-                'overall_prediction': {
-                    'prediction': 'unknown',
-                    'confidence': 0.0,
-                    'num_sentences': 0
-                }
             }
         sentences = self.processor.split_into_sentences(text)
         if not sentences:
             return {}
@@ -125,10 +190,9 @@ class TextClassifier:
         sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
         # Process windows in batches to save memory
-        batch_size = 16
-        for i in range(0, len(windows), batch_size):
-            batch_windows = windows[i:i + batch_size]
-            batch_indices = window_sentence_indices[i:i + batch_size]
             inputs = self.tokenizer(
                 batch_windows,
@@ -142,12 +206,18 @@ class TextClassifier:
                 outputs = self.model(**inputs)
                 probs = F.softmax(outputs.logits, dim=-1)
                 for window_idx, indices in enumerate(batch_indices):
                     for sent_idx in indices:
                         sentence_appearances[sent_idx] += 1
                         sentence_scores[sent_idx]['human_prob'] += probs[window_idx][1].item()
                         sentence_scores[sent_idx]['ai_prob'] += probs[window_idx][0].item()
         # Average the scores and create final sentence-level predictions
         sentence_predictions = []
         for i in range(len(sentences)):
@@ -162,41 +232,46 @@ class TextClassifier:
                     'confidence': max(human_prob, ai_prob)
                 })
-        # Generate analysis outputs
         return {
             'sentence_predictions': sentence_predictions,
-            'highlighted_text': self.format_predictions_html(sentence_predictions),
             'full_text': text,
-            'overall_prediction': self.aggregate_predictions(sentence_predictions)
         }
-    def format_predictions_html(self, sentence_predictions: List[Dict]) -> str:
-        """Format predictions as HTML with color-coding."""
-        html_parts = []
         for pred in sentence_predictions:
             sentence = pred['sentence']
             confidence = pred['confidence']
             if confidence >= CONFIDENCE_THRESHOLD:
                 if pred['prediction'] == 'human':
-                    color = "#90EE90"  # Light green
                 else:
-                    color = "#FFB6C6"  # Light red
             else:
                 if pred['prediction'] == 'human':
-                    color = "#E8F5E9"  # Very light green
                 else:
-                    color = "#FFEBEE"  # Very light red
-            html_parts.append(f'<span style="background-color: {color};">{sentence}</span>')
-        return " ".join(html_parts)
-    def aggregate_predictions(self, predictions: List[Dict]) -> Dict:
-        """Aggregate predictions from multiple sentences into a single prediction."""
         if not predictions:
             return {
                 'prediction': 'unknown',
                 'confidence': 0.0,
                 'num_sentences': 0
@@ -210,63 +285,73 @@ class TextClassifier:
         avg_ai_prob = total_ai_prob / num_sentences
         return {
             'prediction': 'human' if avg_human_prob > avg_ai_prob else 'ai',
             'confidence': max(avg_human_prob, avg_ai_prob),
             'num_sentences': num_sentences
         }
-def analyze_text(text: str, classifier: TextClassifier) -> tuple:
-    """Analyze text and return formatted results for Gradio interface."""
-    # Get predictions
-    analysis = classifier.predict_with_sentence_scores(text)
-    # Format sentence-by-sentence analysis
-    detailed_analysis = []
-    for pred in analysis['sentence_predictions']:
-        confidence = pred['confidence'] * 100
-        detailed_analysis.append(f"Sentence: {pred['sentence']}")
-        detailed_analysis.append(f"Prediction: {pred['prediction'].upper()}")
-        detailed_analysis.append(f"Confidence: {confidence:.1f}%")
-        detailed_analysis.append("-" * 50)
-    # Format overall prediction
-    final_pred = analysis['overall_prediction']
-    overall_result = f"""
-    FINAL PREDICTION: {final_pred['prediction'].upper()}
-    Overall confidence: {final_pred['confidence']*100:.1f}%
-    Number of sentences analyzed: {final_pred['num_sentences']}
-    """
-    return (
-        analysis['highlighted_text'],
-        "\n".join(detailed_analysis),
-        overall_result
-    )
-# Initialize the classifier globally
-classifier = TextClassifier()
-# Create Gradio interface
-demo = gr.Interface(
-    fn=lambda text: analyze_text(text, classifier),
-    inputs=gr.Textbox(
-        lines=8,
-        placeholder="Enter text to analyze...",
-        label="Input Text"
-    ),
-    outputs=[
-        gr.HTML(label="Highlighted Analysis"),
-        gr.Textbox(label="Sentence-by-Sentence Analysis", lines=10),
-        gr.Textbox(label="Overall Result", lines=4)
-    ],
-    title="AI Text Detector",
-    description="Analyze text to detect if it was written by a human or AI. Text is analyzed sentence by sentence, with color coding indicating the prediction confidence.",
-    examples=[
-        ["This is a sample text written by a human. It contains multiple sentences with different ideas. The analysis will show how each sentence is classified. This demonstrates the AI detection capabilities."],
-    ],
-    allow_flagging="never"
-)
-# Launch the interface
 if __name__ == "__main__":
-    demo.launch(share=True)

 import torch
 import torch.nn.functional as F
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import spacy
 from typing import List, Dict
 import logging
 import os
+from colorama import init, Fore, Back, Style
+# Initialize colorama for colored terminal output
+init()
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# Constants - matching original implementations
 MAX_LENGTH = 512
 MODEL_NAME = "microsoft/deberta-v3-small"
 WINDOW_SIZE = 17
 WINDOW_OVERLAP = 2
 CONFIDENCE_THRESHOLD = 0.65
+BATCH_SIZE = 16  # Matching original batch size
+class TextProcessor:
     def __init__(self):
         try:
             self.nlp = spacy.load("en_core_web_sm")
         except OSError:
             logger.info("Downloading spacy model...")
+            os.system("python -m spacy download en_core_web_sm")
             self.nlp = spacy.load("en_core_web_sm")
         if 'sentencizer' not in self.nlp.pipe_names:
         doc = self.nlp(text)
         return [str(sent).strip() for sent in doc.sents]
+    def create_windows(self, sentences: List[str], window_size: int, overlap: int) -> List[str]:
+        if len(sentences) < window_size:
+            return [" ".join(sentences)]
+        windows = []
+        stride = window_size - overlap
+        for i in range(0, len(sentences) - window_size + 1, stride):
+            window = sentences[i:i + window_size]
+            windows.append(" ".join(window))
+        return windows
     def create_centered_windows(self, sentences: List[str], window_size: int) -> tuple[List[str], List[List[int]]]:
         """Create windows centered around each sentence for detailed analysis."""
         windows = []
         window_sentence_indices = []
         for i in range(len(sentences)):
+            # Calculate window boundaries centered on current sentence
             half_window = window_size // 2
             start_idx = max(0, i - half_window)
             end_idx = min(len(sentences), i + half_window + 1)
+            # Adjust window if we're near the edges
             if start_idx == 0:
                 end_idx = min(len(sentences), window_size)
             elif end_idx == len(sentences):
         return windows, window_sentence_indices
+class AITextDetector:
     def __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.processor = TextProcessor()
         self.tokenizer = None
         self.model = None
+        self._initialize_model()
+    def _initialize_model(self):
+        """Initialize model and tokenizer."""
+        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
         self.model = AutoModelForSequenceClassification.from_pretrained(
+            MODEL_NAME,
             num_labels=2
         ).to(self.device)
+        try:
+            model_path = "model_20250209_184929_acc1.0000.pt"
             checkpoint = torch.load(model_path, map_location=self.device)
             self.model.load_state_dict(checkpoint['model_state_dict'])
+            logger.info(f"Loaded model from {model_path}")
+        except Exception as e:
+            logger.error(f"Failed to load model: {e}")
+            raise
+    def quick_scan(self, text: str) -> Dict:
+        """
+        Quick scan implementation matching the second original program's predict method.
+        """
+        if self.model is None or self.tokenizer is None:
+            self._initialize_model()
         self.model.eval()
+        sentences = self.processor.split_into_sentences(text)
+        windows = self.processor.create_windows(sentences, WINDOW_SIZE, WINDOW_OVERLAP)
+        predictions = []
+        # Process windows in batches to save memory
+        for i in range(0, len(windows), BATCH_SIZE):
+            batch_windows = windows[i:i + BATCH_SIZE]
+            inputs = self.tokenizer(
+                batch_windows,
+                truncation=True,
+                padding=True,
+                max_length=MAX_LENGTH,
+                return_tensors="pt"
+            ).to(self.device)
+            with torch.no_grad():
+                outputs = self.model(**inputs)
+                probs = F.softmax(outputs.logits, dim=-1)
+                for idx, window in enumerate(batch_windows):
+                    prediction = {
+                        'window': window,
+                        'human_prob': probs[idx][1].item(),
+                        'ai_prob': probs[idx][0].item(),
+                        'prediction': 'human' if probs[idx][1] > probs[idx][0] else 'ai'
+                    }
+                    predictions.append(prediction)
+            # Clear memory
+            del inputs, outputs, probs
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+        return self._aggregate_quick_predictions(predictions)
+    def _aggregate_quick_predictions(self, predictions: List[Dict]) -> Dict:
+        """
+        Aggregate predictions matching the second original program.
+        """
+        if not predictions:
             return {
+                'human_prob': 0.0,
+                'ai_prob': 0.0,
+                'prediction': 'unknown',
+                'confidence': 0.0,
+                'num_windows': 0
             }
+        avg_human_prob = sum(p['human_prob'] for p in predictions) / len(predictions)
+        avg_ai_prob = sum(p['ai_prob'] for p in predictions) / len(predictions)
+        return {
+            'human_prob': avg_human_prob,
+            'ai_prob': avg_ai_prob,
+            'prediction': 'human' if avg_human_prob > avg_ai_prob else 'ai',
+            'confidence': max(avg_human_prob, avg_ai_prob),
+            'num_windows': len(predictions)
+        }
+    def detailed_scan(self, text: str) -> Dict:
+        """
+        Detailed scan implementation matching the first original program's
+        predict_with_sentence_scores method.
+        """
+        if self.model is None or self.tokenizer is None:
+            self._initialize_model()
+        self.model.eval()
         sentences = self.processor.split_into_sentences(text)
         if not sentences:
             return {}
         sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
         # Process windows in batches to save memory
+        for i in range(0, len(windows), BATCH_SIZE):
+            batch_windows = windows[i:i + BATCH_SIZE]
+            batch_indices = window_sentence_indices[i:i + BATCH_SIZE]
             inputs = self.tokenizer(
                 batch_windows,
                 outputs = self.model(**inputs)
                 probs = F.softmax(outputs.logits, dim=-1)
+                # Attribute window predictions back to individual sentences
                 for window_idx, indices in enumerate(batch_indices):
                     for sent_idx in indices:
                         sentence_appearances[sent_idx] += 1
                         sentence_scores[sent_idx]['human_prob'] += probs[window_idx][1].item()
                         sentence_scores[sent_idx]['ai_prob'] += probs[window_idx][0].item()
+            # Clear memory
+            del inputs, outputs, probs
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
         # Average the scores and create final sentence-level predictions
         sentence_predictions = []
         for i in range(len(sentences)):
                     'confidence': max(human_prob, ai_prob)
                 })
+        # Generate highlighted text output
+        highlighted_text = self._generate_highlighted_text(sentence_predictions)
         return {
             'sentence_predictions': sentence_predictions,
+            'highlighted_text': highlighted_text,
             'full_text': text,
+            'overall_prediction': self._aggregate_detailed_predictions(sentence_predictions)
         }
+    def _generate_highlighted_text(self, sentence_predictions: List[Dict]) -> str:
+        """Generate colored text output with highlighting based on predictions."""
+        highlighted_parts = []
         for pred in sentence_predictions:
             sentence = pred['sentence']
             confidence = pred['confidence']
             if confidence >= CONFIDENCE_THRESHOLD:
                 if pred['prediction'] == 'human':
+                    highlighted_parts.append(f"{Back.GREEN}{sentence}{Style.RESET_ALL}")
                 else:
+                    highlighted_parts.append(f"{Back.RED}{sentence}{Style.RESET_ALL}")
             else:
+                # Low confidence predictions get a lighter highlight
                 if pred['prediction'] == 'human':
+                    highlighted_parts.append(f"{Back.LIGHTGREEN_EX}{sentence}{Style.RESET_ALL}")
                 else:
+                    highlighted_parts.append(f"{Back.LIGHTRED_EX}{sentence}{Style.RESET_ALL}")
+        return " ".join(highlighted_parts)
+    def _aggregate_detailed_predictions(self, predictions: List[Dict]) -> Dict:
+        """
+        Aggregate predictions matching the first original program.
+        """
         if not predictions:
             return {
+                'human_prob': 0.0,
+                'ai_prob': 0.0,
                 'prediction': 'unknown',
                 'confidence': 0.0,
                 'num_sentences': 0
         avg_ai_prob = total_ai_prob / num_sentences
         return {
+            'human_prob': avg_human_prob,
+            'ai_prob': avg_ai_prob,
             'prediction': 'human' if avg_human_prob > avg_ai_prob else 'ai',
             'confidence': max(avg_human_prob, avg_ai_prob),
             'num_sentences': num_sentences
         }
+def main():
+    try:
+        detector = AITextDetector()
+        while True:
+            print("\nAI Text Detector")
+            print("===============")
+            print("1. Quick Scan")
+            print("2. Detailed Scan")
+            print("3. Exit")
+            choice = input("\nSelect an option (1-3): ").strip()
+            if choice == "3":
+                break
+            if choice not in ["1", "2"]:
+                print("Invalid choice. Please select 1, 2, or 3.")
+                continue
+            text = input("\nEnter text to analyze: ").strip()
+            if choice == "1":
+                # Quick scan
+                result = detector.quick_scan(text)
+                print("\nQuick Scan Results:")
+                print("==================")
+                print(f"Prediction: {result['prediction'].upper()}")
+                print(f"Confidence: {result['confidence']*100:.1f}%")
+                print(f"Human Probability: {result['human_prob']*100:.1f}%")
+                print(f"AI Probability: {result['ai_prob']*100:.1f}%")
+                print(f"Number of windows analyzed: {result['num_windows']}")
+            else:
+                # Detailed scan
+                result = detector.detailed_scan(text)
+                print("\nDetailed Analysis:")
+                print("=================")
+                # Print sentence-level predictions
+                for pred in result['sentence_predictions']:
+                    confidence = pred['confidence'] * 100
+                    print(f"\nSentence: {pred['sentence']}")
+                    print(f"Prediction: {pred['prediction'].upper()}")
+                    print(f"Confidence: {confidence:.1f}%")
+                # Print highlighted text
+                print("\nHighlighted Text Analysis:")
+                print("=========================")
+                print(result['highlighted_text'])
+                # Print final prediction
+                final_pred = result['overall_prediction']
+                print(f"\nFINAL PREDICTION: {final_pred['prediction'].upper()}")
+                print(f"Overall confidence: {final_pred['confidence']*100:.1f}%")
+                print(f"Number of sentences analyzed: {final_pred['num_sentences']}")
+    except Exception as e:
+        logger.error(f"An error occurred: {e}")
+        raise
 if __name__ == "__main__":
+    main()