Zen0 committed · Commit daf1822 · verified · Parent: aee4009

Update tasks/text.py

Files changed (1): tasks/text.py (+47 -69)
tasks/text.py CHANGED

@@ -1,9 +1,10 @@
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from fastapi import APIRouter
 from datetime import datetime
 from datasets import load_dataset
 from sklearn.metrics import accuracy_score
-import torch
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import Pipeline
 import numpy as np
 
 from .utils.evaluation import TextEvaluationRequest
@@ -11,9 +12,27 @@ from .utils.emissions import tracker, clean_emissions_data, get_space_info
 
 router = APIRouter()
 
-DESCRIPTION = "FrugalDisinfoHunter Model"
+DESCRIPTION = "Climate Disinformation Detection - TF-IDF + LogReg"
 ROUTE = "/text"
 
+def create_pipeline():
+    """Create an efficient text classification pipeline"""
+    return Pipeline([
+        ('tfidf', TfidfVectorizer(
+            max_features=10000,  # Limit features for efficiency
+            ngram_range=(1, 2),  # Use unigrams and bigrams
+            stop_words='english',
+            min_df=2,  # Remove very rare terms
+            max_df=0.95  # Remove very common terms
+        )),
+        ('classifier', LogisticRegression(
+            C=1.0,
+            multi_class='multinomial',
+            max_iter=200,
+            n_jobs=-1  # Use all CPU cores
+        ))
+    ])
+
 @router.post(ROUTE, tags=["Text Task"], description=DESCRIPTION)
 async def evaluate_text(request: TextEvaluationRequest):
     """
@@ -34,81 +53,40 @@ async def evaluate_text(request: TextEvaluationRequest):
         "7_fossil_fuels_needed": 7
     }
 
-    # Load and prepare the dataset
-    dataset = load_dataset(request.dataset_name)
-
-    # Convert string labels to integers
-    dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
-
-    # Split dataset
-    train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
-    test_dataset = train_test["test"]
-
     # Start tracking emissions
     tracker.start()
     tracker.start_task("inference")
 
     try:
-        # Model configuration
-        model_name = "google/mobilebert-uncased"  # Base model
-        local_weights = "model/model.pt"  # Path to our trained weights
-        BATCH_SIZE = 32
-        MAX_LENGTH = 256  # Increased from 128
-
-        # Initialize tokenizer and model
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForSequenceClassification.from_pretrained(
-            model_name,
-            num_labels=8,
-            problem_type="single_label_classification"
+        # Load and prepare the dataset
+        dataset = load_dataset(request.dataset_name)
+
+        # Convert string labels to integers
+        dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
+
+        # Split dataset
+        train_test = dataset["train"].train_test_split(
+            test_size=request.test_size,
+            seed=request.test_seed
         )
 
-        # Load our trained weights
-        try:
-            state_dict = torch.load(local_weights, map_location='cpu')
-            model.load_state_dict(state_dict)
-        except Exception as e:
-            print(f"Error loading weights: {e}")
-            # Continue with base model if weights fail to load
-            pass
-
-        # Move model to appropriate device
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        model = model.to(device)
-        model.eval()  # Set to evaluation mode
-
-        # Get test texts and process in batches
-        test_texts = test_dataset["quote"]
-        predictions = []
-
-        # Process in batches
-        for i in range(0, len(test_texts), BATCH_SIZE):
-            # Clear CUDA cache if using GPU
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-
-            batch_texts = test_texts[i:i + BATCH_SIZE]
-
-            # Tokenize with padding and attention masks
-            inputs = tokenizer(
-                batch_texts,
-                padding=True,
-                truncation=True,
-                max_length=MAX_LENGTH,
-                return_tensors="pt"
-            )
-
-            # Move inputs to device
-            inputs = {k: v.to(device) for k, v in inputs.items()}
-
-            # Run inference with no gradient computation
-            with torch.no_grad():
-                outputs = model(**inputs)
-                batch_preds = torch.argmax(outputs.logits, dim=1)
-                predictions.extend(batch_preds.cpu().numpy())
+        train_dataset = train_test["train"]
+        test_dataset = train_test["test"]
 
+        # Create and train pipeline
+        pipeline = create_pipeline()
+
+        # Train the model
+        pipeline.fit(
+            train_dataset["quote"],
+            train_dataset["label"]
+        )
+
+        # Make predictions
+        predictions = pipeline.predict(test_dataset["quote"])
+
         # Get true labels
-        true_labels = test_dataset['label']
+        true_labels = test_dataset["label"]
 
         # Stop tracking emissions
        emissions_data = tracker.stop_task()
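
The hunk ends at the emissions call, so the scoring step that follows is not shown. Given the accuracy_score import kept at the top of the file, the continuation presumably compares predictions against true_labels; a standalone illustration with toy values (not from the commit):

from sklearn.metrics import accuracy_score

# accuracy_score compares integer class ids element-wise.
true_labels = [7, 3, 0, 1]   # toy values in LABEL_MAPPING's 0-7 range
predictions = [7, 3, 1, 1]   # toy model output; one mismatch
print(accuracy_score(true_labels, predictions))  # -> 0.75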
 
 
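Finally, since the route itself is unchanged, the endpoint can be smoke-tested by mounting the router on a throwaway app. A sketch under two assumptions: that the package layout matches the tasks/text.py path, and that TextEvaluationRequest exposes the dataset_name, test_size, and test_seed fields the handler reads; the request values are placeholders:

from fastapi import FastAPI
from fastapi.testclient import TestClient

from tasks.text import router  # the module changed in this commit

app = FastAPI()
app.include_router(router)
client = TestClient(app)

# Field names inferred from request.dataset_name / test_size / test_seed above;
# the dataset name is a placeholder, not something this commit specifies.
response = client.post("/text", json={
    "dataset_name": "<dataset-name>",
    "test_size": 0.2,
    "test_seed": 42,
})
print(response.status_code, response.json())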