"""Preprocessing pipeline for the Bengaluru house price dataset."""

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

class Preprocessing:
    def __init__(self, data):
        """Initialize with the dataset."""
        self.data = data

    def clean_data(self):
        """Cleans and preprocesses the dataset."""
        # Drop duplicates and columns not used downstream
        self.data = self.data.drop_duplicates()
        self.data = self.data.drop(columns=['area_type', 'availability', 'society', 'balcony'], errors='ignore')

        # Drop rows with missing target values
        if 'price' in self.data.columns:
            self.data = self.data.dropna(subset=['price'])

        # Fill missing values in numerical columns with the median
        numeric_cols = self.data.select_dtypes(include=['int64', 'float64']).columns
        self.data[numeric_cols] = self.data[numeric_cols].fillna(self.data[numeric_cols].median())

        # Fill missing values in categorical columns with the mode
        categorical_cols = self.data.select_dtypes(include=['object']).columns
        self.data[categorical_cols] = self.data[categorical_cols].fillna(self.data[categorical_cols].mode().iloc[0])

        # Group locations with 10 or fewer listings under 'other'
        if 'location' in self.data.columns:
            location_stats = self.data['location'].value_counts()
            rare_locations = location_stats[location_stats <= 10]
            self.data['location'] = self.data['location'].apply(
                lambda x: 'other' if x in rare_locations else x
            )
        return self.data

    def convert_rangesqft_to_avg(self, x):
        """Convert '-'-separated sqft range values to their average.

        Examples: '2100 - 2850' -> 2475.0, '1200' -> 1200.0, '34.46Sq. Meter' -> None.
        """
        tokens = str(x).split('-')
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                return None
        try:
            return float(x)
        except ValueError:
            # Non-numeric entries become None/NaN and are dropped later
            return None

    def feature_engineering(self):
        """Creates new features and drops irrelevant columns and implausible rows."""
        # Extract the leading integer from the 'size' column (e.g. '2 BHK', '4 Bedroom')
        self.data['bhk'] = self.data['size'].apply(lambda x: int(x.split(' ')[0]) if isinstance(x, str) else None)
        self.data = self.data.drop(columns=['size'])

        # Convert 'total_sqft' ranges to average values if the column exists
        if 'total_sqft' in self.data.columns:
            self.data['total_sqft'] = self.data['total_sqft'].apply(self.convert_rangesqft_to_avg)

        # Drop rows with less than 300 sqft per bedroom (implausibly small units)
        if 'total_sqft' in self.data.columns and 'bhk' in self.data.columns:
            self.data = self.data[~(self.data['total_sqft'] / self.data['bhk'] < 300)]

        # Drop rows where bathrooms exceed bedrooms by two or more (likely data errors)
        if 'bhk' in self.data.columns and 'bath' in self.data.columns:
            self.data = self.data[self.data['bhk'] + 2 > self.data['bath']]

        # Create 'price_per_sqft'; price is quoted in lakhs, hence the factor of 100,000
        if 'total_sqft' in self.data.columns and 'price' in self.data.columns:
            self.data['price_per_sqft'] = self.data['price'] * 100000 / self.data['total_sqft']
        return self.data

    def remove_bhk_outliers(self):
        """Removes outliers based on price_per_sqft for bhk values within each location."""
        exclude_indices = []
        
        for location, location_df in self.data.groupby('location'):
            # Calculate statistics for each bhk in the location
            bhk_stats = {}
            for bhk, bhk_df in location_df.groupby('bhk'):
                bhk_stats[bhk] = {
                    'mean': np.mean(bhk_df['price_per_sqft']),
                    'std': np.std(bhk_df['price_per_sqft']),
                    'count': bhk_df.shape[0]
                }

            # Exclude listings whose price_per_sqft falls below the mean for
            # homes with one fewer bedroom in the same location (e.g. a 3 BHK
            # cheaper per sqft than the average nearby 2 BHK is suspect)
            for bhk, bhk_df in location_df.groupby('bhk'):
                stats = bhk_stats.get(bhk - 1)
                if stats and stats['count'] > 5:
                    exclude_indices.extend(
                        bhk_df[bhk_df['price_per_sqft'] < stats['mean']].index.values
                    )

        # Drop identified outliers
        self.data = self.data.drop(index=exclude_indices)
        print(f"Removed {len(exclude_indices)} outliers based on bhk and price_per_sqft.")
        return self.data
    
    def encode_features(self):
        """Encodes categorical features using pandas.get_dummies for one-hot encoding."""
        categorical_cols = self.data.select_dtypes(include=['object']).columns
        if categorical_cols.empty:
            print("No categorical features found for encoding.")
            return self.data

        # One-hot encode every remaining categorical column (in this pipeline,
        # only 'location' survives cleaning and feature engineering)
        dummies = pd.get_dummies(self.data[categorical_cols], drop_first=True).astype(int)
        self.data = pd.concat([self.data.drop(columns=categorical_cols), dummies], axis=1)

        print(f"Categorical features encoded: {len(categorical_cols)}")
        print(f"New dataset shape after encoding: {self.data.shape}")

        return self.data

    def scale_features(self, exclude=()):
        """Scales numerical features with StandardScaler, leaving `exclude` columns (e.g. the target) untouched.

        Note: fitting the scaler on the full dataset before splitting leaks test
        statistics into training; fit on the training split alone for a stricter pipeline.
        """
        scaler = StandardScaler()
        numeric_cols = self.data.select_dtypes(include=['int64', 'float64']).columns.difference(exclude)
        self.data[numeric_cols] = scaler.fit_transform(self.data[numeric_cols])
        return self.data
    
    def handle_missing_values(self):
        """Handles remaining missing values after scaling."""
        # Drop rows with missing values
        self.data = self.data.dropna()
        return self.data

    def split_data(self, target_column, test_size=0.2, random_state=42):
        """Splits the dataset into training and testing sets.

        Args:
            target_column (str): The column to be used as the target variable.
            test_size (float): Proportion of the dataset to include in the test split.
            random_state (int): Random seed for reproducibility.

        Returns:
            tuple: X_train, X_test, y_train, y_test
        """
        if target_column not in self.data.columns:
            raise ValueError(f"Target column '{target_column}' not found in the dataset.")

        X = self.data.drop(columns=[target_column])
        y = self.data[target_column]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
        return X_train, X_test, y_train, y_test

# Example Usage
if __name__ == "__main__":
    df = pd.read_csv("data/bengaluru_house_prices.csv")

    preprocessor = Preprocessing(data=df)
    # Data preprocessing steps
    preprocessor.clean_data()  # Clean the data
    preprocessor.feature_engineering()  # Perform feature engineering
    preprocessor.remove_bhk_outliers()  # Remove outliers
    preprocessor.encode_features()  # Encode features
    preprocessor.scale_features(exclude=['price'])  # Scale features; the target stays unscaled
    preprocessor.handle_missing_values()  # Handle remaining missing values
    print(preprocessor.data.columns.tolist())
    print(preprocessor.data.shape)
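
    # A minimal sketch of the final split step using split_data above. The derived
    # 'price_per_sqft' column is dropped first, since it encodes the target and
    # would leak it into the features.
    preprocessor.data = preprocessor.data.drop(columns=['price_per_sqft'], errors='ignore')
    X_train, X_test, y_train, y_test = preprocessor.split_data(target_column='price')
    print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")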
    print("\nprocessing completed !!!")