"""Preprocessing pipeline for the Bengaluru house price dataset."""

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

class Preprocessing:
    def __init__(self, data):
        """Initialize with the dataset."""
        self.data = data

    def clean_data(self):
        """Cleans and preprocesses the dataset."""
        # Drop duplicates and columns not used downstream
        self.data = self.data.drop_duplicates()
        self.data = self.data.drop(columns=['area_type', 'availability', 'society', 'balcony'], errors='ignore')

        # Drop rows with missing target values
        if 'price' in self.data.columns:
            self.data = self.data.dropna(subset=['price'])

        # Fill missing values in numerical columns with the median
        numeric_cols = self.data.select_dtypes(include=['int64', 'float64']).columns
        self.data[numeric_cols] = self.data[numeric_cols].fillna(self.data[numeric_cols].median())

        # Fill missing values in categorical columns with the mode
        categorical_cols = self.data.select_dtypes(include=['object']).columns
        self.data[categorical_cols] = self.data[categorical_cols].fillna(self.data[categorical_cols].mode().iloc[0])

        # Group locations with 10 or fewer listings under 'other'
        if 'location' in self.data.columns:
            location_stats = self.data['location'].value_counts()
            rare_locations = location_stats[location_stats <= 10]
            self.data['location'] = self.data['location'].apply(
                lambda x: 'other' if x in rare_locations else x
            )
        return self.data

    def convert_rangesqft_to_avg(self, x):
        """Convert '-'-separated sqft range values to their average.

        Examples: '2100 - 2850' -> 2475.0, '1200' -> 1200.0, '34.46Sq. Meter' -> None.
        """
        tokens = str(x).split('-')
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                return None
        try:
            return float(x)
        except ValueError:
            # Non-numeric entries become None/NaN and are dropped later
            return None

    def feature_engineering(self):
        """Creates new features and drops irrelevant columns and implausible rows."""
        # Extract the leading integer from the 'size' column (e.g. '2 BHK', '4 Bedroom')
        self.data['bhk'] = self.data['size'].apply(lambda x: int(x.split(' ')[0]) if isinstance(x, str) else None)
        self.data = self.data.drop(columns=['size'])

        # Convert 'total_sqft' ranges to average values if the column exists
        if 'total_sqft' in self.data.columns:
            self.data['total_sqft'] = self.data['total_sqft'].apply(self.convert_rangesqft_to_avg)

        # Drop rows with less than 300 sqft per bedroom (implausibly small units)
        if 'total_sqft' in self.data.columns and 'bhk' in self.data.columns:
            self.data = self.data[~(self.data['total_sqft'] / self.data['bhk'] < 300)]

        # Drop rows where bathrooms exceed bedrooms by two or more (likely data errors)
        if 'bhk' in self.data.columns and 'bath' in self.data.columns:
            self.data = self.data[self.data['bhk'] + 2 > self.data['bath']]

        # Create 'price_per_sqft'; price is quoted in lakhs, hence the factor of 100,000
        if 'total_sqft' in self.data.columns and 'price' in self.data.columns:
            self.data['price_per_sqft'] = self.data['price'] * 100000 / self.data['total_sqft']
        return self.data

    def remove_bhk_outliers(self):
        """Removes outliers based on price_per_sqft for bhk values within each location."""
        exclude_indices = []
        
        for location, location_df in self.data.groupby('location'):
            # Calculate statistics for each bhk in the location
            bhk_stats = {}
            for bhk, bhk_df in location_df.groupby('bhk'):
                bhk_stats[bhk] = {
                    'mean': np.mean(bhk_df['price_per_sqft']),
                    'std': np.std(bhk_df['price_per_sqft']),
                    'count': bhk_df.shape[0]
                }

            # Exclude listings whose price_per_sqft falls below the mean for
            # homes with one fewer bedroom in the same location (e.g. a 3 BHK
            # cheaper per sqft than the average nearby 2 BHK is suspect)
            for bhk, bhk_df in location_df.groupby('bhk'):
                stats = bhk_stats.get(bhk - 1)
                if stats and stats['count'] > 5:
                    exclude_indices.extend(
                        bhk_df[bhk_df['price_per_sqft'] < stats['mean']].index.values
                    )

        # Drop identified outliers
        self.data = self.data.drop(index=exclude_indices)
        print(f"Removed {len(exclude_indices)} outliers based on bhk and price_per_sqft.")
        return self.data
    
    def encode_features(self):
        """Encodes categorical features using pandas.get_dummies for one-hot encoding."""
        categorical_cols = self.data.select_dtypes(include=['object']).columns
        if categorical_cols.empty:
            print("No categorical features found for encoding.")
            return self.data

        # One-hot encode every remaining categorical column (in this pipeline,
        # only 'location' survives cleaning and feature engineering)
        dummies = pd.get_dummies(self.data[categorical_cols], drop_first=True).astype(int)
        self.data = pd.concat([self.data.drop(columns=categorical_cols), dummies], axis=1)

        print(f"Categorical features encoded: {len(categorical_cols)}")
        print(f"New dataset shape after encoding: {self.data.shape}")

        return self.data

    def scale_features(self, exclude=()):
        """Scales numerical features with StandardScaler, leaving `exclude` columns (e.g. the target) untouched.

        Note: fitting the scaler on the full dataset before splitting leaks test
        statistics into training; fit on the training split alone for a stricter pipeline.
        """
        scaler = StandardScaler()
        numeric_cols = self.data.select_dtypes(include=['int64', 'float64']).columns.difference(exclude)
        self.data[numeric_cols] = scaler.fit_transform(self.data[numeric_cols])
        return self.data
    
    def handle_missing_values(self):
        """Handles remaining missing values after scaling."""
        # Drop rows with missing values
        self.data = self.data.dropna()
        return self.data

    def split_data(self, target_column, test_size=0.2, random_state=42):
        """Splits the dataset into training and testing sets.

        Args:
            target_column (str): The column to be used as the target variable.
            test_size (float): Proportion of the dataset to include in the test split.
            random_state (int): Random seed for reproducibility.

        Returns:
            tuple: X_train, X_test, y_train, y_test
        """
        if target_column not in self.data.columns:
            raise ValueError(f"Target column '{target_column}' not found in the dataset.")

        X = self.data.drop(columns=[target_column])
        y = self.data[target_column]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
        return X_train, X_test, y_train, y_test

# Example Usage
if __name__ == "__main__":
    df = pd.read_csv("data/bengaluru_house_prices.csv")

    preprocessor = Preprocessing(data=df)
    # Data preprocessing steps
    preprocessor.clean_data()  # Clean the data
    preprocessor.feature_engineering()  # Perform feature engineering
    preprocessor.remove_bhk_outliers()  # Remove outliers
    preprocessor.encode_features()  # Encode features
    preprocessor.scale_features(exclude=['price'])  # Scale features; the target stays unscaled
    preprocessor.handle_missing_values()  # Handle remaining missing values
    print(preprocessor.data.columns.tolist())
    print(preprocessor.data.shape)
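
    # A minimal sketch of the final split step using split_data above. The derived
    # 'price_per_sqft' column is dropped first, since it encodes the target and
    # would leak it into the features.
    preprocessor.data = preprocessor.data.drop(columns=['price_per_sqft'], errors='ignore')
    X_train, X_test, y_train, y_test = preprocessor.split_data(target_column='price')
    print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")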
    print("\nprocessing completed !!!")