pgurazada1 committed on
Commit
dd2fed7
·
verified ·
1 Parent(s): e3dd54d

Create train.py

Browse files
Files changed (1) hide show
  1. train.py +76 -0
train.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Training script: fit a logistic-regression pipeline on OpenML dataset 42890,
# tune the regularisation strength with a small randomized search, report
# cross-validated and held-out metrics, and serialize the best pipeline.
import joblib

from sklearn.datasets import fetch_openml

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer

from sklearn.pipeline import make_pipeline

from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Downloads the dataset from OpenML (network access required on first run).
dataset = fetch_openml(data_id=42890, as_frame=True, parser="auto")

data_df = dataset.data

# NOTE(review): the target column is read from `dataset.data`, which assumes
# OpenML does not split 'Machine failure' out as the default target - confirm.
target = 'Machine failure'
numeric_features = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
]
categorical_features = ['Type']

print("Creating Data Subsets")

X = data_df[numeric_features + categorical_features]
y = data_df[target]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified - consider `stratify=y` if the
# target classes turn out to be imbalanced.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

# Scale the numeric columns and one-hot encode the categorical column;
# categories unseen during fitting are ignored at prediction time.
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    (OneHotEncoder(handle_unknown='ignore'), categorical_features),
)

model_logistic_regression = LogisticRegression(n_jobs=-1)

print("Estimating the Best Model Pipeline")

model_pipeline = make_pipeline(
    preprocessor,
    model_logistic_regression,
)

# `make_pipeline` names each step after its lowercased class name, hence the
# "logisticregression__" prefix for the classifier's hyper-parameters.
param_distribution = {
    "logisticregression__C": [0.001, 0.01, 0.1, 0.5, 1, 5, 10]
}

# Try 3 of the candidate C values, each scored with 3-fold cross-validation.
rand_search_cv = RandomizedSearchCV(
    model_pipeline,
    param_distribution,
    n_iter=3,
    cv=3,
    random_state=42,
)

rand_search_cv.fit(Xtrain, ytrain)

print("Logging Metrics")
# `best_score_` is the mean cross-validated score of the best candidate
# (the classifier's default scorer, i.e. accuracy), not a held-out score.
print(f"Accuracy: {rand_search_cv.best_score_}")
print(f"Best parameters: {rand_search_cv.best_params_}")

# Evaluate the refit best pipeline on the held-out split, which the original
# script created (and imported metrics for) but never used.
test_predictions = rand_search_cv.best_estimator_.predict(Xtest)
print(f"Test accuracy: {accuracy_score(ytest, test_predictions)}")
print(classification_report(ytest, test_predictions))

print("Serializing the Best Model")

saved_model_path = "model.joblib"

# Persist the whole pipeline (preprocessing + classifier) so that inference
# code can feed it raw feature frames.
joblib.dump(rand_search_cv.best_estimator_, saved_model_path)