Spaces:

WickedFaith
/

Synthack-SyntaxSquad

Running

App Files Files Community

Synthack-SyntaxSquad / src /train_model.py

WickedFaith

Upload 77 files

3efedb0 verified about 2 months ago

raw

history blame

2.57 kB

	import pandas as pd
	import numpy as np
	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import StandardScaler, OneHotEncoder
	from sklearn.compose import ColumnTransformer
	from sklearn.pipeline import Pipeline
	from sklearn.ensemble import RandomForestClassifier
	import pickle
	import os

	# Resolve the project root: the parent of the directory containing this script.
	project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

	# Load the dataset from <project_root>/data/loan_approval_dataset.csv
	data_path = os.path.join(project_root, 'data', 'loan_approval_dataset.csv')
	print(f"Loading data from: {data_path}")
	df = pd.read_csv(data_path)

	# Clean column names (remove leading/trailing whitespace)
	df.columns = df.columns.str.strip()

	# Clean string values (remove leading/trailing whitespace) in every
	# object-dtype column, one column per loop iteration.
	for col in df.select_dtypes(include=['object']).columns:
	    df[col] = df[col].str.strip()

	# Columns used as model inputs, grouped by the preprocessing they receive.
	numerical_features = [
	    'no_of_dependents',
	    'income_annum',
	    'loan_amount',
	    'loan_term',
	    'cibil_score',
	    'residential_assets_value',
	    'commercial_assets_value',
	    'luxury_assets_value',
	    'bank_asset_value',
	]
	categorical_features = ['education', 'self_employed']

	# Prepare features and target: 'Approved' -> 1, 'Rejected' -> 0.
	X = df[numerical_features + categorical_features]
	y = df['loan_status'].map({'Approved': 1, 'Rejected': 0})

	# Series.map yields NaN for any label that is not a key of the mapping.
	# Fail here with the offending labels instead of letting NaN targets reach
	# the estimator and surface as an unrelated error during fitting.
	unmapped = y.isna()
	if unmapped.any():
	    bad_labels = sorted(df.loc[unmapped, 'loan_status'].astype(str).unique())
	    raise ValueError(
	        f"Unexpected loan_status values (expected 'Approved' or 'Rejected'): "
	        f"{bad_labels}"
	    )

	# Preprocessing: standardize the numeric columns and one-hot encode the
	# categorical ones (handle_unknown='ignore' is passed to the encoder).
	numeric_transformer = Pipeline([('scaler', StandardScaler())])
	categorical_transformer = Pipeline(
	    [('onehot', OneHotEncoder(handle_unknown='ignore'))]
	)

	# Route each column group to its transformer.
	column_steps = [
	    ('num', numeric_transformer, numerical_features),
	    ('cat', categorical_transformer, categorical_features),
	]
	preprocessor = ColumnTransformer(column_steps)

	# Full pipeline: preprocessing followed by a random forest classifier
	# with a fixed seed.
	forest = RandomForestClassifier(n_estimators=100, random_state=42)
	model = Pipeline([('preprocessor', preprocessor), ('classifier', forest)])

	# Hold out 20% of the rows for evaluation, using a fixed seed for the split.
	X_train, X_test, y_train, y_test = train_test_split(
	    X,
	    y,
	    test_size=0.2,
	    random_state=42,
	)

	# Fit the whole pipeline on the training portion only.
	model.fit(X_train, y_train)

	# Score the fitted pipeline on both partitions, then report the results.
	train_score, test_score = (
	    model.score(X_train, y_train),
	    model.score(X_test, y_test),
	)
	print(f"Train accuracy: {train_score:.3f}")
	print(f"Test accuracy: {test_score:.3f}")

	# Save the model under <project_root>/models, creating the directory if needed.
	models_dir = os.path.join(project_root, 'models')
	os.makedirs(models_dir, exist_ok=True)

	# Serialize the fitted pipeline with pickle; the file is closed when the
	# with-block exits.
	# NOTE: pickle files can execute code when loaded, so only load
	# loan_model.pkl from a trusted location.
	with open(os.path.join(models_dir, 'loan_model.pkl'), 'wb') as f:
	    pickle.dump(model, f)

	print("Model saved successfully as loan_model.pkl!")