Zen0 committed · Commit daf1822 · verified · Parent: aee4009

Update tasks/text.py

Files changed (1): tasks/text.py (+47 -69)
tasks/text.py CHANGED

@@ -1,9 +1,10 @@
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from fastapi import APIRouter
 from datetime import datetime
 from datasets import load_dataset
 from sklearn.metrics import accuracy_score
-import torch
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import Pipeline
 import numpy as np
 
 from .utils.evaluation import TextEvaluationRequest
@@ -11,9 +12,27 @@ from .utils.emissions import tracker, clean_emissions_data, get_space_info
 
 router = APIRouter()
 
-DESCRIPTION = "FrugalDisinfoHunter Model"
+DESCRIPTION = "Climate Disinformation Detection - TF-IDF + LogReg"
 ROUTE = "/text"
 
+def create_pipeline():
+    """Create an efficient text classification pipeline"""
+    return Pipeline([
+        ('tfidf', TfidfVectorizer(
+            max_features=10000,  # Limit features for efficiency
+            ngram_range=(1, 2),  # Use unigrams and bigrams
+            stop_words='english',
+            min_df=2,  # Remove very rare terms
+            max_df=0.95  # Remove very common terms
+        )),
+        ('classifier', LogisticRegression(
+            C=1.0,
+            multi_class='multinomial',
+            max_iter=200,
+            n_jobs=-1  # Use all CPU cores
+        ))
+    ])
+
 @router.post(ROUTE, tags=["Text Task"], description=DESCRIPTION)
 async def evaluate_text(request: TextEvaluationRequest):
     """
@@ -34,81 +53,40 @@ async def evaluate_text(request: TextEvaluationRequest):
         "7_fossil_fuels_needed": 7
     }
 
-    # Load and prepare the dataset
-    dataset = load_dataset(request.dataset_name)
-
-    # Convert string labels to integers
-    dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
-
-    # Split dataset
-    train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
-    test_dataset = train_test["test"]
-
     # Start tracking emissions
     tracker.start()
     tracker.start_task("inference")
 
     try:
-        # Model configuration
-        model_name = "google/mobilebert-uncased"  # Base model
-        local_weights = "model/model.pt"  # Path to our trained weights
-        BATCH_SIZE = 32
-        MAX_LENGTH = 256  # Increased from 128
-
-        # Initialize tokenizer and model
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForSequenceClassification.from_pretrained(
-            model_name,
-            num_labels=8,
-            problem_type="single_label_classification"
+        # Load and prepare the dataset
+        dataset = load_dataset(request.dataset_name)
+
+        # Convert string labels to integers
+        dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
+
+        # Split dataset
+        train_test = dataset["train"].train_test_split(
+            test_size=request.test_size,
+            seed=request.test_seed
         )
 
-        # Load our trained weights
-        try:
-            state_dict = torch.load(local_weights, map_location='cpu')
-            model.load_state_dict(state_dict)
-        except Exception as e:
-            print(f"Error loading weights: {e}")
-            # Continue with base model if weights fail to load
-            pass
-
-        # Move model to appropriate device
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        model = model.to(device)
-        model.eval()  # Set to evaluation mode
-
-        # Get test texts and process in batches
-        test_texts = test_dataset["quote"]
-        predictions = []
-
-        # Process in batches
-        for i in range(0, len(test_texts), BATCH_SIZE):
-            # Clear CUDA cache if using GPU
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-
-            batch_texts = test_texts[i:i + BATCH_SIZE]
-
-            # Tokenize with padding and attention masks
-            inputs = tokenizer(
-                batch_texts,
-                padding=True,
-                truncation=True,
-                max_length=MAX_LENGTH,
-                return_tensors="pt"
-            )
-
-            # Move inputs to device
-            inputs = {k: v.to(device) for k, v in inputs.items()}
-
-            # Run inference with no gradient computation
-            with torch.no_grad():
-                outputs = model(**inputs)
-                batch_preds = torch.argmax(outputs.logits, dim=1)
-                predictions.extend(batch_preds.cpu().numpy())
+        train_dataset = train_test["train"]
+        test_dataset = train_test["test"]
 
+        # Create and train pipeline
+        pipeline = create_pipeline()
+
+        # Train the model
+        pipeline.fit(
+            train_dataset["quote"],
+            train_dataset["label"]
+        )
+
+        # Make predictions
+        predictions = pipeline.predict(test_dataset["quote"])
+
         # Get true labels
-        true_labels = test_dataset['label']
+        true_labels = test_dataset["label"]
 
         # Stop tracking emissions
        emissions_data = tracker.stop_task()
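
The hunk ends at the emissions call, so the scoring step that follows is not shown. Given the accuracy_score import kept at the top of the file, the continuation presumably compares predictions against true_labels; a standalone illustration with toy values (not from the commit):

from sklearn.metrics import accuracy_score

# accuracy_score compares integer class ids element-wise.
true_labels = [7, 3, 0, 1]   # toy values in LABEL_MAPPING's 0-7 range
predictions = [7, 3, 1, 1]   # toy model output; one mismatch
print(accuracy_score(true_labels, predictions))  # -> 0.75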
 
 
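Finally, since the route itself is unchanged, the endpoint can be smoke-tested by mounting the router on a throwaway app. A sketch under two assumptions: that the package layout matches the tasks/text.py path, and that TextEvaluationRequest exposes the dataset_name, test_size, and test_seed fields the handler reads; the request values are placeholders:

from fastapi import FastAPI
from fastapi.testclient import TestClient

from tasks.text import router  # the module changed in this commit

app = FastAPI()
app.include_router(router)
client = TestClient(app)

# Field names inferred from request.dataset_name / test_size / test_seed above;
# the dataset name is a placeholder, not something this commit specifies.
response = client.post("/text", json={
    "dataset_name": "<dataset-name>",
    "test_size": 0.2,
    "test_seed": 42,
})
print(response.status_code, response.json())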