Spaces:

voxmenthe
/

imdb-sentiment-demo

Sleeping

App Files Files Community

voxmenthe committed 13 days ago

Commit

be92e89

1 Parent(s): af23076

add progress bar to gradio space

Browse files

Files changed (2) hide show

app.py +22 -15
evaluation.py +24 -15

app.py CHANGED Viewed

@@ -102,22 +102,29 @@ def run_full_evaluation_gradio():
         test_dataloader_full = DataLoader(tokenized_imdb_test_full, batch_size=batch_size)
         yield "Dataset tokenized and DataLoader prepared. Starting model evaluation on the test set..."
-        # The 'evaluate' function from evaluation.py expects the dataloader to be potentially wrapped by tqdm
-        # based on how it was called in evaluation.py's main block.
-        # We will wrap it with tqdm here for consistency if evaluate function expects it.
-        # Note: tqdm progress here will go to console, not Gradio UI directly.
-        tqdm_dataloader = tqdm(test_dataloader_full, desc="Evaluating in App")
-        results = evaluate(model, tqdm_dataloader, device)
-        results_str = "--- Full Evaluation Results ---\n"
-        for key, value in results.items():
-            if isinstance(value, float):
-                results_str += f"{key.capitalize()}: {value:.4f}\n"
             else:
-                results_str += f"{key.capitalize()}: {value}\n"
-        results_str += "\nEvaluation finished."
-        yield results_str
     except Exception as e:
         import traceback

         test_dataloader_full = DataLoader(tokenized_imdb_test_full, batch_size=batch_size)
         yield "Dataset tokenized and DataLoader prepared. Starting model evaluation on the test set..."
+        # The 'evaluate' function from evaluation.py is now a generator.
+        # Iterate through its yielded updates and results.
+        final_results_str = ""
+        for update in evaluate(model, test_dataloader_full, device):
+            if isinstance(update, dict):
+                # This is the final results dictionary
+                results_str = "--- Full Evaluation Results ---\n"
+                for key, value in update.items():
+                    if isinstance(value, float):
+                        results_str += f"{key.capitalize()}: {value:.4f}\n"
+                    else:
+                        results_str += f"{key.capitalize()}: {value}\n"
+                results_str += "\nEvaluation finished."
+                final_results_str = results_str # Store to yield last
+                yield results_str # Optionally yield intermediate dict if needed, or just final string
+                break # Stop after getting the results dict
             else:
+                # This is a progress string
+                yield update
+        # Ensure the final formatted results string is yielded if not already (e.g., if loop broke early)
+        # However, the logic above should yield it before breaking.
+        # If evaluate could end without yielding a dict, this might be needed.
     except Exception as e:
         import traceback

evaluation.py CHANGED Viewed

@@ -10,9 +10,14 @@ def evaluate(model, dataloader, device):
     all_labels = []
     all_probs_for_auc = []
     total_loss = 0
     with torch.no_grad():
-        for batch in dataloader:
             # Move batch to device, ensure all model inputs are covered
             input_ids = batch['input_ids'].to(device)
             attention_mask = batch['attention_mask'].to(device)
@@ -49,15 +54,11 @@ def evaluate(model, dataloader, device):
             all_preds.extend(preds.cpu().numpy())
             all_labels.extend(labels.cpu().numpy())
-            if logits.shape[1] > 1:
-                probs = torch.softmax(logits, dim=1)[:, 1]
-                all_probs_for_auc.extend(probs.cpu().numpy())
-            else:
-                probs = torch.sigmoid(logits)
-                all_probs_for_auc.extend(probs.squeeze().cpu().numpy())
-    avg_loss = total_loss / len(dataloader)
     accuracy = accuracy_score(all_labels, all_preds)
     f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)
     precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
@@ -70,15 +71,18 @@ def evaluate(model, dataloader, device):
         print(f"Could not calculate AUC-ROC: {e}. Labels: {list(set(all_labels))[:10]}. Probs example: {all_probs_for_auc[:5]}. Setting to 0.0")
         roc_auc = 0.0
-    return {
-        'loss': avg_loss,
         'accuracy': accuracy,
         'f1': f1,
         'roc_auc': roc_auc,
         'precision': precision,
         'recall': recall,
-        'mcc': mcc
     }
 if __name__ == "__main__":
     import argparse
@@ -177,9 +181,14 @@ if __name__ == "__main__":
     test_dataloader = DataLoader(tokenized_imdb_test, batch_size=args.batch_size)
     print("Starting evaluation...")
-    progress_bar = tqdm(test_dataloader, desc="Evaluating")
-    results = evaluate(model, progress_bar, device)
     print("\n--- Evaluation Results ---")
     for key, value in results.items():

     all_labels = []
     all_probs_for_auc = []
     total_loss = 0
+    num_batches = len(dataloader)
+    processed_batches = 0
+    yield "Starting evaluation..."
     with torch.no_grad():
+        for batch in dataloader: # dataloader here should not be pre-wrapped with tqdm by the caller if we yield progress
+            processed_batches += 1
             # Move batch to device, ensure all model inputs are covered
             input_ids = batch['input_ids'].to(device)
             attention_mask = batch['attention_mask'].to(device)
             all_preds.extend(preds.cpu().numpy())
             all_labels.extend(labels.cpu().numpy())
+            # Yield progress update
+            if processed_batches % (num_batches // 20) == 0 or processed_batches == num_batches: # Update roughly 20 times + final
+                yield f"Processed {processed_batches}/{num_batches} batches ({processed_batches/num_batches*100:.2f}%)"
+    avg_loss = total_loss / num_batches
     accuracy = accuracy_score(all_labels, all_preds)
     f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)
     precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
         print(f"Could not calculate AUC-ROC: {e}. Labels: {list(set(all_labels))[:10]}. Probs example: {all_probs_for_auc[:5]}. Setting to 0.0")
         roc_auc = 0.0
+    results = {
         'accuracy': accuracy,
         'f1': f1,
         'roc_auc': roc_auc,
         'precision': precision,
         'recall': recall,
+        'mcc': mcc,
+        'average_loss': avg_loss
     }
+    yield f"Processed {processed_batches}/{num_batches} batches (100.00%)" # Ensure final progress update
+    yield "Evaluation complete. Compiling results..."
+    yield results
 if __name__ == "__main__":
     import argparse
     test_dataloader = DataLoader(tokenized_imdb_test, batch_size=args.batch_size)
     print("Starting evaluation...")
+    progress_bar = tqdm(evaluate(model, test_dataloader, device), desc="Evaluating")
+    for update in progress_bar:
+        if isinstance(update, dict):
+            results = update
+            break
+        else:
+            progress_bar.set_postfix_str(update)
     print("\n--- Evaluation Results ---")
     for key, value in results.items():