File size: 1,206 Bytes
5cbc1e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Load the data
train_data = pd.read_csv("./input/train.csv")

# Prepare the features and labels
X = train_data["comment_text"]
y = train_data.iloc[:, 2:]

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words="english")
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

# Train a logistic regression model for each label
scores = []
for label in y.columns:
    lr = LogisticRegression(C=1.0, solver="liblinear")
    lr.fit(X_train_tfidf, y_train[label])
    y_pred = lr.predict_proba(X_val_tfidf)[:, 1]
    score = roc_auc_score(y_val[label], y_pred)
    scores.append(score)
    print(f"ROC AUC for {label}: {score}")

# Calculate the mean column-wise ROC AUC
mean_auc = sum(scores) / len(scores)
print(f"Mean column-wise ROC AUC: {mean_auc}")