File size: 7,388 Bytes
e0a433a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
class Preprocessing:
    """Preprocessing pipeline for the Bengaluru house-price dataset.

    Each step mutates ``self.data`` (a pandas DataFrame) in place and also
    returns it, so steps can be chained or run individually. Steps that
    depend on specific columns skip gracefully when those columns are absent.
    """

    def __init__(self, data):
        """Initialize with the dataset.

        Args:
            data (pd.DataFrame): Raw dataset to preprocess.
        """
        self.data = data

    def clean_data(self):
        """Cleans and preprocesses the dataset.

        Drops duplicates and irrelevant columns, removes rows with missing
        values, imputes any remaining gaps (median for numeric, mode for
        categorical), and groups rare locations (<= 10 listings) as 'other'.

        Returns:
            pd.DataFrame: The cleaned dataset.
        """
        self.data = self.data.drop_duplicates()
        # Drop only the irrelevant columns that actually exist, so this step
        # does not raise KeyError on datasets missing some of them.
        drop_cols = [c for c in ('area_type', 'availability', 'society', 'balcony')
                     if c in self.data.columns]
        self.data = self.data.drop(columns=drop_cols)
        self.data = self.data.dropna()
        # Drop rows with missing target values (redundant after the global
        # dropna above, but kept as an explicit guarantee on the target).
        if 'price' in self.data.columns:
            self.data = self.data.dropna(subset=['price'])
        # Fill missing values for numerical columns with the median.
        numeric_cols = self.data.select_dtypes(include=['int64', 'float64']).columns
        if len(numeric_cols) > 0:
            self.data[numeric_cols] = self.data[numeric_cols].fillna(
                self.data[numeric_cols].median()
            )
        # Fill missing values for categorical columns with the mode. Guarded:
        # mode().iloc[0] raises IndexError on an empty column selection.
        categorical_cols = self.data.select_dtypes(include=['object']).columns
        if len(categorical_cols) > 0:
            self.data[categorical_cols] = self.data[categorical_cols].fillna(
                self.data[categorical_cols].mode().iloc[0]
            )
        # Group rare locations (10 or fewer listings) under a single 'other'
        # label; a set gives O(1) membership tests inside the apply.
        if 'location' in self.data.columns:
            counts = self.data['location'].value_counts()
            rare_locations = set(counts[counts <= 10].index)
            self.data['location'] = self.data['location'].apply(
                lambda x: 'other' if x in rare_locations else x
            )
        return self.data

    def convert_rangesqft_to_avg(self, x):
        """Convert '-' separated sqft range values (e.g. '1000 - 1200') to
        their average; pass plain numbers through as floats.

        Returns:
            float | None: Parsed value, or None when unparseable (e.g.
            '34.46Sq. Meter' or '1.5-2Acres').
        """
        try:
            # str() guards against non-string cells (the original crashed
            # with AttributeError on floats before reaching the fallback).
            tokens = str(x).split('-')
            if len(tokens) == 2:
                return (float(tokens[0]) + float(tokens[1])) / 2
            return float(x)
        except (TypeError, ValueError):
            # Narrow exceptions only: a bare except would also swallow
            # KeyboardInterrupt and genuine bugs.
            return None

    def feature_engineering(self):
        """Derives 'bhk' from 'size', normalizes 'total_sqft', removes
        implausible rows, and adds 'price_per_sqft'.

        Returns:
            pd.DataFrame: The engineered dataset.
        """
        # Extract the leading integer from 'size' (e.g. '2 BHK' -> 2).
        if 'size' in self.data.columns:
            self.data['bhk'] = self.data['size'].apply(
                lambda x: int(x.split(' ')[0]) if isinstance(x, str) else None
            )
            self.data = self.data.drop(columns=['size'])
        # Convert 'total_sqft' ranges to average values if the column exists.
        if 'total_sqft' in self.data.columns:
            self.data['total_sqft'] = self.data['total_sqft'].apply(
                self.convert_rangesqft_to_avg
            )
        # Drop rows with less than 300 sqft per bedroom (implausible listings).
        # The ~(< 300) form deliberately KEEPS rows where the ratio is NaN.
        if 'total_sqft' in self.data.columns and 'bhk' in self.data.columns:
            self.data = self.data[~(self.data['total_sqft'] / self.data['bhk'] < 300)]
        # Drop rows with more bathrooms than bedrooms + 1.
        if 'bhk' in self.data.columns and 'bath' in self.data.columns:
            self.data = self.data[self.data['bhk'] + 2 > self.data['bath']]
        # Price is in lakhs (1 lakh = 100,000 rupees) -> rupees per sqft.
        if 'total_sqft' in self.data.columns and 'price' in self.data.columns:
            self.data['price_per_sqft'] = self.data['price'] * 100000 / self.data['total_sqft']
        return self.data

    def remove_bhk_outliers(self):
        """Removes listings priced below the mean price_per_sqft of
        (bhk - 1) apartments in the same location.

        Returns:
            pd.DataFrame: The dataset with outliers removed.
        """
        # Guard: this step only makes sense once these columns exist.
        required = {'location', 'bhk', 'price_per_sqft'}
        if not required.issubset(self.data.columns):
            return self.data
        exclude_indices = []
        for location, location_df in self.data.groupby('location'):
            # Per-bhk statistics within this location.
            bhk_stats = {}
            for bhk, bhk_df in location_df.groupby('bhk'):
                bhk_stats[bhk] = {
                    'mean': np.mean(bhk_df['price_per_sqft']),
                    'std': np.std(bhk_df['price_per_sqft']),
                    'count': bhk_df.shape[0],
                }
            # A bhk-N flat cheaper per sqft than the average bhk-(N-1) flat
            # in the same area is treated as an outlier (needs > 5 samples).
            for bhk, bhk_df in location_df.groupby('bhk'):
                stats = bhk_stats.get(bhk - 1)
                if stats and stats['count'] > 5:
                    exclude_indices.extend(
                        bhk_df[bhk_df['price_per_sqft'] < stats['mean']].index.values
                    )
        self.data = self.data.drop(index=exclude_indices)
        print(f"Removed {len(exclude_indices)} outliers based on bhk and price_per_sqft.")
        return self.data

    def encode_features(self):
        """One-hot encodes the 'location' column via pandas.get_dummies.

        Returns:
            pd.DataFrame: The dataset with 'location' replaced by dummies.
        """
        categorical_cols = self.data.select_dtypes(include=['object']).columns
        if categorical_cols.empty:
            print("No categorical features found for encoding.")
            return self.data
        # Guard: only 'location' is encoded here; without it the original
        # code raised KeyError despite the generic check above.
        if 'location' not in self.data.columns:
            print("No 'location' column found for encoding.")
            return self.data
        # drop_first avoids the dummy-variable trap (perfect collinearity).
        dummies = pd.get_dummies(self.data['location'], drop_first=True)
        dummies = dummies.astype(int)  # ints rather than bools, for consistency
        self.data = pd.concat([self.data, dummies], axis=1)
        self.data = self.data.drop(columns=['location'])
        print(f"Categorical features encoded: {len(categorical_cols)}")
        print(f"New dataset shape after encoding: {self.data.shape}")
        return self.data

    def scale_features(self):
        """Scales numerical features using StandardScaler.

        NOTE(review): this scales EVERY numeric column, including the target
        'price' if still present — confirm that is intended before modeling.

        Returns:
            pd.DataFrame: The dataset with scaled numeric columns.
        """
        scaler = StandardScaler()
        numeric_cols = self.data.select_dtypes(include=['int64', 'float64']).columns
        if len(numeric_cols) > 0:
            self.data[numeric_cols] = scaler.fit_transform(self.data[numeric_cols])
        return self.data

    def handle_missing_values(self):
        """Drops any rows still containing missing values.

        Returns:
            pd.DataFrame: The dataset without missing values.
        """
        self.data = self.data.dropna()
        return self.data

    def split_data(self, target_column, test_size=0.2, random_state=42):
        """Splits the dataset into training and testing sets.

        Args:
            target_column (str): The column to be used as the target variable.
            test_size (float): Proportion of the dataset for the test split.
            random_state (int): Random seed for reproducibility.

        Returns:
            tuple: X_train, X_test, y_train, y_test

        Raises:
            ValueError: If ``target_column`` is not in the dataset.
        """
        if target_column not in self.data.columns:
            raise ValueError(f"Target column '{target_column}' not found in the dataset.")
        X = self.data.drop(columns=[target_column])
        y = self.data[target_column]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        return X_train, X_test, y_train, y_test
# Example usage: run the full preprocessing pipeline end to end.
if __name__ == "__main__":
    raw = pd.read_csv("data/bengaluru_house_prices.csv")
    preprocessor = Preprocessing(data=raw)
    # Execute each preprocessing stage in order.
    pipeline = (
        preprocessor.clean_data,
        preprocessor.feature_engineering,
        preprocessor.remove_bhk_outliers,
        preprocessor.encode_features,
        preprocessor.scale_features,
        preprocessor.handle_missing_values,
    )
    for stage in pipeline:
        stage()
    # Report the resulting schema and shape.
    print(preprocessor.data.columns.tolist())
    print(preprocessor.data.shape)
    print("\nprocessing completed !!!")
|