Maaz Uddin
commited on
Commit
·
e0a433a
1
Parent(s):
e7769d4
allfilesupload
Browse files- .gitattributes +1 -35
- .github/workflows/python-app.yml +67 -0
- .gitignore +171 -0
- README.md +184 -9
- data/bengaluru_house_prices.csv +0 -0
- dockerfile +16 -0
- images/homepage.png +0 -0
- images/predicted_results.jpg +0 -0
- images/temp +1 -0
- main.py +46 -0
- models/feature_names.pkl +0 -0
- models/lr_regg.pkl +0 -0
- notebooks/Analysis_notebook.ipynb +0 -0
- setup.py +17 -0
- src/EDA.py +105 -0
- src/model.py +103 -0
- src/preprocessing.py +169 -0
- templates/index.html +114 -0
- templates/results.html +66 -0
- tests/__init__.py +1 -0
- tests/test2direct.py +91 -0
- tests/test_model.py +76 -0
.gitattributes
CHANGED
@@ -1,35 +1 @@
|
|
1 |
-
*.
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
1 |
+
*.ipynb linguist-detectable=false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.github/workflows/python-app.yml
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Python application
|
2 |
+
|
3 |
+
on:
|
4 |
+
push:
|
5 |
+
branches: [ "main" ]
|
6 |
+
pull_request:
|
7 |
+
branches: [ "main" ]
|
8 |
+
|
9 |
+
permissions:
|
10 |
+
contents: read
|
11 |
+
|
12 |
+
jobs:
|
13 |
+
build:
|
14 |
+
runs-on: ubuntu-latest
|
15 |
+
|
16 |
+
steps:
|
17 |
+
- uses: actions/checkout@v4
|
18 |
+
|
19 |
+
- name: Set up Python 3.10
|
20 |
+
uses: actions/setup-python@v3
|
21 |
+
with:
|
22 |
+
python-version: "3.10"
|
23 |
+
|
24 |
+
- name: Install dependencies
|
25 |
+
run: |
|
26 |
+
python -m pip install --upgrade pip setuptools wheel
|
27 |
+
pip install flake8 pytest pytest-flask
|
28 |
+
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
|
29 |
+
|
30 |
+
- name: Verify model and template files
|
31 |
+
run: |
|
32 |
+
if [ ! -f "models/lr_regg.pkl" ] || [ ! -f "models/feature_names.pkl" ]; then
|
33 |
+
echo "Model files missing!"
|
34 |
+
exit 1
|
35 |
+
fi
|
36 |
+
if [ ! -d "templates" ] || [ ! -f "templates/index.html" ]; then
|
37 |
+
echo "Template files missing!"
|
38 |
+
exit 1
|
39 |
+
fi
|
40 |
+
|
41 |
+
- name: Lint with flake8
|
42 |
+
run: |
|
43 |
+
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
|
44 |
+
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
|
45 |
+
|
46 |
+
- name: Test with pytest
|
47 |
+
run: python -m pytest tests/ -v
|
48 |
+
|
49 |
+
- name: Start and Test Flask App
|
50 |
+
run: |
|
51 |
+
python app.py &
|
52 |
+
sleep 10
|
53 |
+
curl --retry 5 --retry-delay 5 --retry-connrefused http://127.0.0.1:5000/ || exit 1
|
54 |
+
pkill -f "python app.py"
|
55 |
+
env:
|
56 |
+
FLASK_ENV: testing
|
57 |
+
FLASK_DEBUG: 0
|
58 |
+
|
59 |
+
- name: Check setup.py
|
60 |
+
run: |
|
61 |
+
if [ -f setup.py ]; then
|
62 |
+
python setup.py check
|
63 |
+
python setup.py sdist bdist_wheel
|
64 |
+
pip install -e .
|
65 |
+
else
|
66 |
+
echo "setup.py not found!"
|
67 |
+
fi
|
.gitignore
ADDED
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
|
29 |
+
# PyInstaller
|
30 |
+
# Usually these files are written by a python script from a template
|
31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32 |
+
*.manifest
|
33 |
+
*.spec
|
34 |
+
|
35 |
+
# Installer logs
|
36 |
+
pip-log.txt
|
37 |
+
pip-delete-this-directory.txt
|
38 |
+
|
39 |
+
# Unit test / coverage reports
|
40 |
+
htmlcov/
|
41 |
+
.tox/
|
42 |
+
.nox/
|
43 |
+
.coverage
|
44 |
+
.coverage.*
|
45 |
+
.cache
|
46 |
+
nosetests.xml
|
47 |
+
coverage.xml
|
48 |
+
*.cover
|
49 |
+
*.py,cover
|
50 |
+
.hypothesis/
|
51 |
+
.pytest_cache/
|
52 |
+
cover/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
.pybuilder/
|
76 |
+
target/
|
77 |
+
|
78 |
+
# Jupyter Notebook
|
79 |
+
.ipynb_checkpoints
|
80 |
+
|
81 |
+
# IPython
|
82 |
+
profile_default/
|
83 |
+
ipython_config.py
|
84 |
+
|
85 |
+
# pyenv
|
86 |
+
# For a library or package, you might want to ignore these files since the code is
|
87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
88 |
+
# .python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# UV
|
98 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100 |
+
# commonly ignored for libraries.
|
101 |
+
#uv.lock
|
102 |
+
|
103 |
+
# poetry
|
104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
106 |
+
# commonly ignored for libraries.
|
107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
108 |
+
#poetry.lock
|
109 |
+
|
110 |
+
# pdm
|
111 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
112 |
+
#pdm.lock
|
113 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
114 |
+
# in version control.
|
115 |
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
116 |
+
.pdm.toml
|
117 |
+
.pdm-python
|
118 |
+
.pdm-build/
|
119 |
+
|
120 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
121 |
+
__pypackages__/
|
122 |
+
|
123 |
+
# Celery stuff
|
124 |
+
celerybeat-schedule
|
125 |
+
celerybeat.pid
|
126 |
+
|
127 |
+
# SageMath parsed files
|
128 |
+
*.sage.py
|
129 |
+
|
130 |
+
# Environments
|
131 |
+
.env
|
132 |
+
.venv
|
133 |
+
env/
|
134 |
+
venv/
|
135 |
+
ENV/
|
136 |
+
env.bak/
|
137 |
+
venv.bak/
|
138 |
+
|
139 |
+
# Spyder project settings
|
140 |
+
.spyderproject
|
141 |
+
.spyproject
|
142 |
+
|
143 |
+
# Rope project settings
|
144 |
+
.ropeproject
|
145 |
+
|
146 |
+
# mkdocs documentation
|
147 |
+
/site
|
148 |
+
|
149 |
+
# mypy
|
150 |
+
.mypy_cache/
|
151 |
+
.dmypy.json
|
152 |
+
dmypy.json
|
153 |
+
|
154 |
+
# Pyre type checker
|
155 |
+
.pyre/
|
156 |
+
|
157 |
+
# pytype static type analyzer
|
158 |
+
.pytype/
|
159 |
+
|
160 |
+
# Cython debug symbols
|
161 |
+
cython_debug/
|
162 |
+
|
163 |
+
# PyCharm
|
164 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
165 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
166 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
167 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
168 |
+
#.idea/
|
169 |
+
|
170 |
+
# PyPI configuration file
|
171 |
+
.pypirc
|
README.md
CHANGED
@@ -1,12 +1,187 @@
|
|
1 |
---
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
colorTo: gray
|
6 |
-
sdk: docker
|
7 |
-
pinned: false
|
8 |
-
license: apache-2.0
|
9 |
-
short_description: 'Get recent treands in Housing and Realestate market '
|
10 |
---
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
layout: default
|
3 |
+
title: Real Estate Price Prediction API
|
4 |
+
description: A machine learning powered real estate price prediction tool with web interface
|
|
|
|
|
|
|
|
|
|
|
5 |
---
|
6 |
|
7 |
+
# 🏠 **Real Estate Price Prediction API**
|
8 |
+
|
9 |
+
## 🌟 **Project Motto**
|
10 |
+
This project aims to provide an accurate and interactive **Real Estate Price Prediction tool**. Users can input details such as property location, square footage, number of bedrooms, and bathrooms to get an **instant price prediction** based on a trained **machine learning model**.
|
11 |
+
|
12 |
+
This API bridges the gap between **data science** and **user-friendly deployment**, allowing seamless integration of advanced predictions into real-world applications.
|
13 |
+
|
14 |
+
---
|
15 |
+
|
16 |
+
## 💡 **How It Works**
|
17 |
+
|
18 |
+
1. **Data Processing & Model Training**
|
19 |
+
- A dataset of real estate transactions was cleaned and processed.
|
20 |
+
- Key features such as `location`, `total_sqft`, `bath`, and `bhk` were selected.
|
21 |
+
- A **Linear Regression model** was trained and stored as a `.pkl` file for deployment.
|
22 |
+
|
23 |
+
2. **Prediction Mechanism**
|
24 |
+
- The trained model is loaded and predicts property prices based on user inputs.
|
25 |
+
- Location data is one-hot encoded to handle categorical features.
|
26 |
+
|
27 |
+
3. **Interactive Frontend**
|
28 |
+
- A Flask-powered web app provides an intuitive interface for predictions.
|
29 |
+
- Users input details via forms, and results are displayed instantly.
|
30 |
+
|
31 |
+
4. **API Integration**
|
32 |
+
- A `/predict` endpoint allows developers to integrate the model with other applications.
|
33 |
+
---
|
34 |
+
|
35 |
+
## 🎥 Watch the Demo(click image below👇)
|
36 |
+
|
37 |
+
[](https://www.youtube.com/watch?v=NcmXkE907io)
|
38 |
+
|
39 |
+
|
40 |
+
---
|
41 |
+
|
42 |
+
## 📷 **Screenshots**
|
43 |
+
### Home Page
|
44 |
+

|
45 |
+
|
46 |
+
### Prediction Results
|
47 |
+

|
48 |
+
|
49 |
+
---
|
50 |
+
|
51 |
+
## 📂 **Project Structure**
|
52 |
+
|
53 |
+
```
|
54 |
+
├── .github/
|
55 |
+
│ └── workflows/
|
56 |
+
│ └── python-app.yml # CI/CD workflow configuration
|
57 |
+
├── data/ # Dataset directory
|
58 |
+
│ └── bengaluru_house_prices.csv # Dataset file for the project
|
59 |
+
├── models/ # Saved models and feature names
|
60 |
+
│ ├── feature_names.pkl # Pickled feature names
|
61 |
+
│ └── lr_regg.pkl # Trained regression model
|
62 |
+
├── src/ # Source code for the project
|
63 |
+
│ ├── EDA.py # Exploratory Data Analysis script
|
64 |
+
│ ├── model.py # Model training and evaluation script
|
65 |
+
│ └── preprocessing.py # Data preprocessing logic
|
66 |
+
├── templates/ # HTML templates for the Flask web app
|
67 |
+
│ ├── index.html # User input form for predictions
|
68 |
+
│ └── results.html # Displays prediction results
|
69 |
+
├── tests/ # Unit testing for the project
|
70 |
+
│ ├── __init__.py # Marks the directory as a package
|
71 |
+
│ ├── test_model.py # Tests for the model
|
72 |
+
│ └── test2direct.py # Additional test script
|
73 |
+
├── .gitignore # Specifies ignored files for Git
|
74 |
+
├── app.py # Flask application entry point
|
75 |
+
├── main.py # Main execution script
|
76 |
+
├── requirements.txt # List of dependencies for the project
|
77 |
+
├── setup.py # Setup script for packaging the project
|
78 |
+
├── README.md # Project overview and documentation
|
79 |
+
|
80 |
+
```
|
81 |
+
|
82 |
+
---
|
83 |
+
|
84 |
+
## 🚀 **Features**
|
85 |
+
- **Accurate Price Predictions** using a trained regression model.
|
86 |
+
- **Interactive Web Interface** for user-friendly predictions.
|
87 |
+
- **API Integration** for developers to use the model programmatically.
|
88 |
+
- **Scalable and Extendable** to new locations or additional features.
|
89 |
+
|
90 |
+
---
|
91 |
+
|
92 |
+
## 🛠️ **Installation and Setup**
|
93 |
+
|
94 |
+
### Prerequisites
|
95 |
+
- Python 3.8+
|
96 |
+
- Flask
|
97 |
+
- Pickle
|
98 |
+
|
99 |
+
### Installation Steps
|
100 |
+
1. Clone the repository:
|
101 |
+
```bash
|
102 |
+
git clone https://github.com/Maazuddin1/Banglore_RealEstate_forecast-using-CICD-piplines.git
|
103 |
+
cd Banglore_RealEstate_forecast-using-CICD-piplines
|
104 |
+
|
105 |
+
```
|
106 |
+
|
107 |
+
2. Create a virtual environment:
|
108 |
+
```bash
|
109 |
+
python -m venv env
|
110 |
+
source env/bin/activate # Linux/Mac
|
111 |
+
env\Scripts\activate # Windows
|
112 |
+
```
|
113 |
+
|
114 |
+
3. Install dependencies:
|
115 |
+
```bash
|
116 |
+
pip install -r requirements.txt
|
117 |
+
```
|
118 |
+
|
119 |
+
4. Start the Flask application:
|
120 |
+
```bash
|
121 |
+
python app.py
|
122 |
+
```
|
123 |
+
|
124 |
+
5. Open your browser and navigate to `http://127.0.0.1:5000/`.
|
125 |
+
|
126 |
+
---
|
127 |
+
|
128 |
+
## 🌐 **API Usage**
|
129 |
+
|
130 |
+
### Endpoint: `/predict`
|
131 |
+
**Method**: `POST`
|
132 |
+
**Input** (JSON):
|
133 |
+
```json
|
134 |
+
{
|
135 |
+
"location": "Whitefield",
|
136 |
+
"sqft": 1200,
|
137 |
+
"bath": 2,
|
138 |
+
"bhk": 3
|
139 |
+
}
|
140 |
+
```
|
141 |
+
|
142 |
+
**Output**:
|
143 |
+
```json
|
144 |
+
{
|
145 |
+
"predicted_price": 94.23 Lakhs
|
146 |
+
}
|
147 |
+
```
|
148 |
+
|
149 |
+
---
|
150 |
+
|
151 |
+
## 🔍 **Model Details**
|
152 |
+
The trained model uses **Linear Regression** with key features like:
|
153 |
+
- **total_sqft**: Total square footage of the property.
|
154 |
+
- **bath**: Number of bathrooms.
|
155 |
+
- **bhk**: Number of bedrooms.
|
156 |
+
- **Location**: One-hot encoded for categorical support.
|
157 |
+
|
158 |
+
---
|
159 |
+
|
160 |
+
## 📈 **Future Enhancements**
|
161 |
+
- Add support for more advanced machine learning models like Random Forest or XGBoost.
|
162 |
+
- Improve UI design with frameworks like Bootstrap.
|
163 |
+
- Expand location datasets for better predictions.
|
164 |
+
- Add real-time price scraping for dynamic updates.
|
165 |
+
|
166 |
+
---
|
167 |
+
|
168 |
+
## 🖼️ **Visual Workflow**
|
169 |
+
```mermaid
|
170 |
+
graph TD
|
171 |
+
A[User Input] --> B[Flask App]
|
172 |
+
B --> C[Process Input Features]
|
173 |
+
C --> D[Trained ML Model]
|
174 |
+
D --> E[Predict Price]
|
175 |
+
E --> F[Display Results]
|
176 |
+
```
|
177 |
+
|
178 |
+
---
|
179 |
+
|
180 |
+
## 🌟 **Contributions**
|
181 |
+
Contributions are welcome! Feel free to fork this repository, open issues, or submit pull requests.
|
182 |
+
|
183 |
+
---
|
184 |
+
|
185 |
+
## 📄 **License**
|
186 |
+
-
|
187 |
+
---
|
data/bengaluru_house_prices.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
dockerfile
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
|
2 |
+
# you will also find guides on how best to write your Dockerfile
|
3 |
+
|
4 |
+
FROM python:3.9
|
5 |
+
|
6 |
+
RUN useradd -m -u 1000 user
|
7 |
+
USER user
|
8 |
+
ENV PATH="/home/user/.local/bin:$PATH"
|
9 |
+
|
10 |
+
WORKDIR /app
|
11 |
+
|
12 |
+
COPY --chown=user ./requirements.txt requirements.txt
|
13 |
+
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
14 |
+
|
15 |
+
COPY --chown=user . /app
|
16 |
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
images/homepage.png
ADDED
![]() |
images/predicted_results.jpg
ADDED
![]() |
images/temp
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
.
|
main.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.preprocessing import Preprocessing
|
2 |
+
from src.model import ModelBuilder
|
3 |
+
from tests import test2direct
|
4 |
+
import pandas as pd
|
5 |
+
import pickle
|
6 |
+
import os
|
7 |
+
|
8 |
+
def main():
|
9 |
+
# Load the dataset
|
10 |
+
data = pd.read_csv("data/bengaluru_house_prices.csv")
|
11 |
+
|
12 |
+
# Preprocess the data
|
13 |
+
print("Starting Data Preprocessing...")
|
14 |
+
preprocessor = Preprocessing(data)
|
15 |
+
preprocessor.clean_data()
|
16 |
+
preprocessor.feature_engineering()
|
17 |
+
preprocessor.remove_bhk_outliers()
|
18 |
+
preprocessor.encode_features()
|
19 |
+
preprocessor.scale_features()
|
20 |
+
preprocessor.handle_missing_values()
|
21 |
+
print("Preprocessing completed!")
|
22 |
+
|
23 |
+
# Build and evaluate the model
|
24 |
+
print("Starting Model Building and Evaluation...")
|
25 |
+
model_builder = ModelBuilder(data=preprocessor.data)
|
26 |
+
X_train, X_test, y_train, y_test = model_builder.split_data(target_column='price')
|
27 |
+
|
28 |
+
model_builder.train_model(X_train, y_train)
|
29 |
+
mse, r2 = model_builder.evaluate_model(X_test, y_test)
|
30 |
+
|
31 |
+
#print(f"Model Evaluation:\nMean Squared Error: {mse}\nR2 Score: {r2}")
|
32 |
+
|
33 |
+
# Save the trained model
|
34 |
+
print("Trained model saved successfully!")
|
35 |
+
|
36 |
+
# Save the trained model as a pickle file
|
37 |
+
model_builder.save_model_as_pickle()
|
38 |
+
|
39 |
+
# Save the feature names as a pickle file
|
40 |
+
model_builder.save_features_as_pickle(data=preprocessor.data)
|
41 |
+
|
42 |
+
test2direct.main()
|
43 |
+
|
44 |
+
|
45 |
+
if __name__ == "__main__":
|
46 |
+
main()
|
models/feature_names.pkl
ADDED
Binary file (3.63 kB). View file
|
|
models/lr_regg.pkl
ADDED
Binary file (7.91 kB). View file
|
|
notebooks/Analysis_notebook.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
setup.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from setuptools import setup, find_packages
|
2 |
+
|
3 |
+
setup(
|
4 |
+
name="Banglore_house_price_estimator",
|
5 |
+
version="1.0",
|
6 |
+
description="A machine learning project for house price prediction in Banglore",
|
7 |
+
author="Maaz uddin",
|
8 |
+
packages=find_packages(),
|
9 |
+
install_requires=[
|
10 |
+
"flask",
|
11 |
+
"pandas",
|
12 |
+
"numpy",
|
13 |
+
"scikit-learn",
|
14 |
+
"seaborn",
|
15 |
+
"matplotlib"
|
16 |
+
]
|
17 |
+
)
|
src/EDA.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
import seaborn as sns
|
4 |
+
|
5 |
+
class EDA:
|
6 |
+
def __init__(self, data_path):
|
7 |
+
"""Initialize with dataset path."""
|
8 |
+
self.data_path = data_path
|
9 |
+
self.data = None
|
10 |
+
|
11 |
+
def load_data(self):
|
12 |
+
"""Loads the dataset from the provided path."""
|
13 |
+
self.data = pd.read_csv(self.data_path)
|
14 |
+
return self.data
|
15 |
+
|
16 |
+
def basic_info(self):
|
17 |
+
"""Displays basic information about the dataset."""
|
18 |
+
print("\nDataset Info:\n")
|
19 |
+
print(self.data.info())
|
20 |
+
print("\nShape:", self.data.shape)
|
21 |
+
print("\nMissing Values:\n", self.data.isnull().sum())
|
22 |
+
print("\nDuplicate Rows:", self.data.duplicated().sum())
|
23 |
+
return self.data.describe()
|
24 |
+
|
25 |
+
def missing_value_analysis(self):
|
26 |
+
"""Analyzes and visualizes missing values."""
|
27 |
+
missing_data = self.data.isnull().sum()
|
28 |
+
missing_data = missing_data[missing_data > 0].sort_values(ascending=False)
|
29 |
+
|
30 |
+
if not missing_data.empty:
|
31 |
+
plt.figure(figsize=(8, 6))
|
32 |
+
sns.barplot(x=missing_data.index, y=missing_data.values, palette='viridis')
|
33 |
+
plt.title('Missing Values Count')
|
34 |
+
plt.xticks(rotation=45)
|
35 |
+
plt.ylabel('Count')
|
36 |
+
plt.show()
|
37 |
+
|
38 |
+
return missing_data
|
39 |
+
|
40 |
+
def visualize_distributions(self):
|
41 |
+
"""Visualizes distributions of numerical features."""
|
42 |
+
numeric_cols = self.data.select_dtypes(include=['int64', 'float64']).columns
|
43 |
+
self.data[numeric_cols].hist(bins=15, figsize=(10, 8), color='skyblue', edgecolor='black')
|
44 |
+
plt.suptitle('Feature Distributions', fontsize=16)
|
45 |
+
plt.show()
|
46 |
+
|
47 |
+
def correlation_heatmap(self):
|
48 |
+
"""Plots a heatmap of feature correlations."""
|
49 |
+
plt.figure(figsize=(10, 8))
|
50 |
+
sns.heatmap(self.data.corr(), annot=True, cmap='coolwarm', fmt='.2f')
|
51 |
+
plt.title('Feature Correlation Heatmap')
|
52 |
+
plt.show()
|
53 |
+
|
54 |
+
def detect_outliers(self, feature):
|
55 |
+
"""Detects and visualizes outliers for a given feature."""
|
56 |
+
plt.figure(figsize=(8, 6))
|
57 |
+
sns.boxplot(x=self.data[feature], color='lightblue')
|
58 |
+
plt.title(f'Outliers in {feature}')
|
59 |
+
plt.show()
|
60 |
+
|
61 |
+
def feature_summary(self):
|
62 |
+
"""Provides a summary of categorical and numerical features."""
|
63 |
+
categorical_cols = self.data.select_dtypes(include=['object']).columns
|
64 |
+
numeric_cols = self.data.select_dtypes(include=['int64', 'float64']).columns
|
65 |
+
|
66 |
+
print("\nCategorical Features:")
|
67 |
+
for col in categorical_cols:
|
68 |
+
print(f"{col}: {self.data[col].nunique()} unique values")
|
69 |
+
print(self.data[col].value_counts().head(10))
|
70 |
+
print("---")
|
71 |
+
|
72 |
+
print("\nNumerical Features:")
|
73 |
+
for col in numeric_cols:
|
74 |
+
print(f"{col}: Mean={self.data[col].mean()}, Median={self.data[col].median()}, Std={self.data[col].std()}")
|
75 |
+
print("---")
|
76 |
+
|
77 |
+
def pairwise_scatterplots(self, features):
|
78 |
+
"""Plots scatterplots for selected features."""
|
79 |
+
sns.pairplot(self.data[features], diag_kind='kde', plot_kws={'alpha': 0.5})
|
80 |
+
plt.suptitle('Pairwise Scatterplots', fontsize=16)
|
81 |
+
plt.show()
|
82 |
+
|
83 |
+
def target_analysis(self, target_col):
|
84 |
+
"""Analyzes target variable distribution."""
|
85 |
+
plt.figure(figsize=(8, 6))
|
86 |
+
sns.histplot(self.data[target_col], kde=True, bins=30, color='blue')
|
87 |
+
plt.title(f'Distribution of {target_col}')
|
88 |
+
plt.xlabel(target_col)
|
89 |
+
plt.ylabel('Frequency')
|
90 |
+
plt.show()
|
91 |
+
|
92 |
+
|
93 |
+
if __name__ == "__main__":
|
94 |
+
eda = EDA(data_path="data/bengaluru_house_prices.csv")
|
95 |
+
data = eda.load_data()
|
96 |
+
eda.basic_info()
|
97 |
+
eda.missing_value_analysis()
|
98 |
+
eda.visualize_distributions()
|
99 |
+
eda.correlation_heatmap()
|
100 |
+
eda.detect_outliers('price')
|
101 |
+
eda.feature_summary()
|
102 |
+
eda.pairwise_scatterplots(features=['price', 'total_sqft', 'bath', 'bhk'])
|
103 |
+
eda.target_analysis(target_col='price')
|
104 |
+
print("Missing values summary:")
|
105 |
+
print(eda.missing_value_analysis())
|
src/model.py
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from sklearn.model_selection import train_test_split
|
3 |
+
from sklearn.linear_model import LinearRegression
|
4 |
+
from sklearn.metrics import mean_squared_error, r2_score
|
5 |
+
import pickle # Import pickle for saving models
|
6 |
+
import os # Import os for directory operations
|
7 |
+
|
8 |
+
class ModelBuilder:
|
9 |
+
def __init__(self, data):
|
10 |
+
"""Initialize with the dataset."""
|
11 |
+
self.data = data
|
12 |
+
self.model = None
|
13 |
+
|
14 |
+
def split_data(self, target_column, test_size=0.2, random_state=42):
|
15 |
+
"""Splits the data into training and testing sets."""
|
16 |
+
if target_column not in self.data.columns:
|
17 |
+
raise ValueError(f"Target column '{target_column}' not found in the dataset.")
|
18 |
+
|
19 |
+
X = self.data.drop(columns=[target_column])
|
20 |
+
y = self.data[target_column]
|
21 |
+
|
22 |
+
X_train, X_test, y_train, y_test = train_test_split(
|
23 |
+
X, y, test_size=test_size, random_state=random_state
|
24 |
+
)
|
25 |
+
|
26 |
+
#print('x_test:', X_test.head())
|
27 |
+
#print('First 15 column names:', X_test.columns[:15])
|
28 |
+
#print('First 15 column data:', X_test.iloc[:15, :10])
|
29 |
+
print(f"Data split complete: Train size = {len(X_train)}, Test size = {len(X_test)}")
|
30 |
+
return X_train, X_test, y_train, y_test
|
31 |
+
|
32 |
+
def train_model(self, X_train, y_train):
|
33 |
+
"""Trains a Linear Regression model."""
|
34 |
+
self.model = LinearRegression()
|
35 |
+
self.model.fit(X_train, y_train)
|
36 |
+
print("Model training complete.")
|
37 |
+
|
38 |
+
def evaluate_model(self, X_test, y_test):
|
39 |
+
"""Evaluates the model on the test set."""
|
40 |
+
if self.model is None:
|
41 |
+
raise ValueError("Model has not been trained yet.")
|
42 |
+
|
43 |
+
y_pred = self.model.predict(X_test)
|
44 |
+
mse = mean_squared_error(y_test, y_pred)
|
45 |
+
r2 = r2_score(y_test, y_pred)
|
46 |
+
accuracy = self.model.score(X_test, y_test)
|
47 |
+
|
48 |
+
print(f"Model Evaluation:\nMean Squared Error: {mse}\nR2 Score(accuracy): {r2}")
|
49 |
+
return mse, r2
|
50 |
+
|
51 |
+
|
52 |
+
def save_model_as_pickle(self, model_path='models/lr_regg.pkl'):
|
53 |
+
"""Save the trained model as a pickle file."""
|
54 |
+
if self.model is None:
|
55 |
+
raise ValueError("Model has not been trained yet.")
|
56 |
+
|
57 |
+
# Create the models directory if it doesn't exist
|
58 |
+
#os.makedirs(os.path.dirname(model_path), exist_ok=True)
|
59 |
+
|
60 |
+
# Save the model
|
61 |
+
with open(model_path, 'wb') as file:
|
62 |
+
pickle.dump(self.model, file)
|
63 |
+
|
64 |
+
print(f"Model saved as pickle at {model_path}")
|
65 |
+
return model_path
|
66 |
+
|
67 |
+
|
68 |
+
def save_features_as_pickle(self, data, target_column='price', file_path='models/feature_names.pkl'):
|
69 |
+
"""
|
70 |
+
Extract feature names from the data and save them as a pickle file.
|
71 |
+
|
72 |
+
Args:
|
73 |
+
data (pd.DataFrame): Input dataset.
|
74 |
+
target_column (str): Name of the target column to exclude from features.
|
75 |
+
file_path (str): Path to save the pickle file.
|
76 |
+
"""
|
77 |
+
# Ensure the target column exists
|
78 |
+
if target_column not in data.columns:
|
79 |
+
raise ValueError(f"Target column '{target_column}' not found in the dataset.")
|
80 |
+
|
81 |
+
# Drop the target column and extract feature names
|
82 |
+
feature_names = data.drop(columns=[target_column]).columns.tolist()
|
83 |
+
|
84 |
+
# Ensure directory exists
|
85 |
+
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
86 |
+
|
87 |
+
# Save the feature names as a pickle file
|
88 |
+
with open(file_path, "wb") as file:
|
89 |
+
pickle.dump(feature_names, file)
|
90 |
+
|
91 |
+
print(f"Feature names saved to {file_path}")
|
92 |
+
|
93 |
+
def load_model_from_pickle(self, model_path):
|
94 |
+
"""Load a model from a pickle file."""
|
95 |
+
if not os.path.exists(model_path):
|
96 |
+
raise FileNotFoundError(f"No model found at {model_path}")
|
97 |
+
|
98 |
+
with open(model_path, 'rb') as file:
|
99 |
+
self.model = pickle.load(file)
|
100 |
+
|
101 |
+
print(f"Model loaded from {model_path}")
|
102 |
+
return self.model
|
103 |
+
|
src/preprocessing.py
ADDED
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import pandas as pd
|
3 |
+
from sklearn.preprocessing import OneHotEncoder, StandardScaler
|
4 |
+
from sklearn.model_selection import train_test_split
|
5 |
+
|
6 |
+
class Preprocessing:
    """Cleaning, feature engineering, outlier removal, encoding and splitting
    for the Bengaluru house-price dataset."""

    def __init__(self, data):
        """Initialize with the dataset.

        Args:
            data (pd.DataFrame): Raw housing dataset.
        """
        self.data = data

    def clean_data(self):
        """Clean and preprocess the dataset.

        Drops duplicates and irrelevant columns, removes rows with missing
        values, imputes any remaining gaps, and groups rare locations under
        a single 'other' label.

        Returns:
            pd.DataFrame: The cleaned dataset.
        """
        self.data = self.data.drop_duplicates()
        # errors='ignore' keeps this safe when a column is already absent
        # (the original unconditional drop raised KeyError in that case).
        self.data = self.data.drop(
            columns=['area_type', 'availability', 'society', 'balcony'],
            errors='ignore',
        )
        self.data = self.data.dropna()

        # Defensive: the dropna above removes NaN targets already, but keep
        # the guard so partial pipelines stay safe.
        if 'price' in self.data.columns:
            self.data = self.data.dropna(subset=['price'])

        # Impute numerical columns with the median.
        numeric_cols = self.data.select_dtypes(include=['int64', 'float64']).columns
        if len(numeric_cols) > 0:
            self.data[numeric_cols] = self.data[numeric_cols].fillna(self.data[numeric_cols].median())

        # Impute categorical columns with the mode; guard against an empty
        # selection (mode().iloc[0] raises IndexError on an empty frame).
        categorical_cols = self.data.select_dtypes(include=['object']).columns
        if len(categorical_cols) > 0 and len(self.data) > 0:
            self.data[categorical_cols] = self.data[categorical_cols].fillna(self.data[categorical_cols].mode().iloc[0])

        # Group locations with <= 10 listings into one 'other' bucket to
        # keep the one-hot dimensionality manageable.
        if 'location' in self.data.columns:
            location_stats = self.data['location'].value_counts()
            rare_locations = location_stats[location_stats <= 10]
            self.data['location'] = self.data['location'].apply(
                lambda x: 'other' if x in rare_locations else x
            )
        return self.data

    def convert_rangesqft_to_avg(self, x):
        """Convert '1200 - 1500' style range strings to their average.

        Args:
            x: Raw total_sqft cell value (string or numeric).

        Returns:
            float | None: Parsed value, or None when unparseable so the row
            can be dropped later by dropna().
        """
        # str() makes this safe for numeric cells (the original crashed
        # with AttributeError on non-string input).
        tokens = str(x).split('-')
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                # e.g. '12-x' or unit-suffixed halves: treat as unparseable.
                return None
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    def feature_engineering(self):
        """Create engineered features and drop inconsistent rows.

        - Extracts the integer bedroom count ('bhk') from the 'size' text.
        - Converts 'total_sqft' range strings to averages.
        - Drops rows with < 300 sqft per bedroom or bath > bhk + 1.
        - Adds 'price_per_sqft' (price is quoted in lakhs, hence the 1e5 factor).

        Returns:
            pd.DataFrame: The transformed dataset.
        """
        # Guarded: the original del'd 'size' unconditionally and raised
        # KeyError when the column was missing.
        if 'size' in self.data.columns:
            self.data['bhk'] = self.data['size'].apply(
                lambda x: int(x.split(' ')[0]) if isinstance(x, str) else None
            )
            self.data = self.data.drop(columns=['size'])

        if 'total_sqft' in self.data.columns:
            self.data['total_sqft'] = self.data['total_sqft'].apply(self.convert_rangesqft_to_avg)

        # Drop implausible rows: less than 300 sqft per bedroom.
        if 'total_sqft' in self.data.columns and 'bhk' in self.data.columns:
            self.data = self.data[~(self.data['total_sqft'] / self.data['bhk'] < 300)]

        # Drop rows where bathrooms exceed bedrooms + 1.
        if 'bhk' in self.data.columns and 'bath' in self.data.columns:
            self.data = self.data[self.data['bhk'] + 2 > self.data['bath']]

        # Price is in lakhs (1 lakh = 100,000 INR).
        if 'total_sqft' in self.data.columns and 'price' in self.data.columns:
            self.data['price_per_sqft'] = self.data['price'] * 100000 / self.data['total_sqft']
        return self.data

    def remove_bhk_outliers(self):
        """Remove per-location BHK pricing outliers.

        Within each location, drop n-BHK rows whose price_per_sqft falls
        below the mean price_per_sqft of the (n-1)-BHK group, provided that
        group has more than 5 samples — a smaller flat should not cost more
        per sqft than a bigger one in the same area.

        Returns:
            pd.DataFrame: Dataset with outliers removed.
        """
        exclude_indices = []

        for location, location_df in self.data.groupby('location'):
            # Per-BHK price_per_sqft statistics for this location.
            bhk_stats = {}
            for bhk, bhk_df in location_df.groupby('bhk'):
                bhk_stats[bhk] = {
                    'mean': np.mean(bhk_df['price_per_sqft']),
                    'std': np.std(bhk_df['price_per_sqft']),
                    'count': bhk_df.shape[0]
                }

            # Flag rows priced below the mean of the next-smaller BHK group.
            for bhk, bhk_df in location_df.groupby('bhk'):
                stats = bhk_stats.get(bhk - 1)
                if stats and stats['count'] > 5:
                    exclude_indices.extend(
                        bhk_df[bhk_df['price_per_sqft'] < stats['mean']].index.values
                    )

        self.data = self.data.drop(index=exclude_indices)
        print(f"Removed {len(exclude_indices)} outliers based on bhk and price_per_sqft.")
        return self.data

    def encode_features(self):
        """One-hot encode the 'location' column using pd.get_dummies.

        Returns:
            pd.DataFrame: Dataset with encoded location columns.
        """
        categorical_cols = self.data.select_dtypes(include=['object']).columns
        if categorical_cols.empty:
            print("No categorical features found for encoding.")
            return self.data

        # Guarded: the original indexed 'location' unconditionally and raised
        # KeyError when other object columns existed but 'location' did not.
        if 'location' in self.data.columns:
            # drop_first avoids the dummy-variable trap for linear models.
            dummies = pd.get_dummies(self.data['location'], drop_first=True)
            dummies = dummies.astype(int)  # ints instead of bools for consistency
            self.data = pd.concat([self.data, dummies], axis=1)
            self.data = self.data.drop(columns=['location'])

        print(f"Categorical features encoded: {len(categorical_cols)}")
        print(f"New dataset shape after encoding: {self.data.shape}")

        return self.data

    def scale_features(self):
        """Standardize all numerical columns (zero mean, unit variance).

        NOTE(review): this also scales the target column if it is numeric —
        call after separating the target if that is not desired.

        Returns:
            pd.DataFrame: Dataset with scaled numerical features.
        """
        scaler = StandardScaler()
        numeric_cols = self.data.select_dtypes(include=['int64', 'float64']).columns
        if len(numeric_cols) > 0:
            self.data[numeric_cols] = scaler.fit_transform(self.data[numeric_cols])
        return self.data

    def handle_missing_values(self):
        """Drop any rows still containing missing values after the pipeline.

        Returns:
            pd.DataFrame: Dataset with no missing values.
        """
        self.data = self.data.dropna()
        return self.data

    def split_data(self, target_column, test_size=0.2, random_state=42):
        """Split the dataset into training and testing sets.

        Args:
            target_column (str): The column to be used as the target variable.
            test_size (float): Proportion of the dataset to include in the test split.
            random_state (int): Random seed for reproducibility.

        Returns:
            tuple: X_train, X_test, y_train, y_test

        Raises:
            ValueError: If *target_column* is not present in the dataset.
        """
        if target_column not in self.data.columns:
            raise ValueError(f"Target column '{target_column}' not found in the dataset.")

        X = self.data.drop(columns=[target_column])
        y = self.data[target_column]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
        return X_train, X_test, y_train, y_test
|
153 |
+
|
154 |
+
# Example Usage
|
155 |
+
# Example usage: run the full preprocessing pipeline end-to-end.
if __name__ == "__main__":
    raw = pd.read_csv("data/bengaluru_house_prices.csv")

    pipeline = Preprocessing(data=raw)

    # Pipeline stages, in order: clean, engineer, de-outlier, encode,
    # scale, then drop any remaining NaNs.
    pipeline.clean_data()
    pipeline.feature_engineering()
    pipeline.remove_bhk_outliers()
    pipeline.encode_features()
    pipeline.scale_features()
    pipeline.handle_missing_values()

    print(pipeline.data.columns.tolist())
    print(pipeline.data.shape)
    print("\nprocessing completed !!!")
|
169 |
+
|
templates/index.html
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>House Price Predictor</title>
    <!-- Bootstrap 5.1.3 from CDN for layout and form styling -->
    <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet">
    <style>
        body {
            padding: 20px;
            background-color: #f8f9fa;
        }
        .container {
            max-width: 800px;
            background-color: white;
            padding: 30px;
            border-radius: 10px;
            box-shadow: 0 0 10px rgba(0,0,0,0.1);
            margin-top: 50px;
        }
        .prediction-result {
            margin-top: 20px;
            padding: 20px;
            border-radius: 5px;
            background-color: #e9ecef;
        }
        .property-details {
            margin-top: 20px;
            padding: 15px;
            border: 1px solid #dee2e6;
            border-radius: 5px;
        }
    </style>
</head>
<body>
    <div class="container">
        <h2 class="text-center mb-4">Bangalore House Price Predictor</h2>

        <!-- Server-side error banner (set by the rendering view) -->
        {% if error %}
        <div class="alert alert-danger" role="alert">
            {{ error }}
        </div>
        {% endif %}

        <!-- Prediction input form; posts back to the same route -->
        <form method="POST" class="needs-validation" novalidate>
            <div class="mb-3">
                <label for="location" class="form-label">Location:</label>
                <select class="form-select" id="location" name="location" required>
                    <option value="">Select a location</option>
                    {% for location in locations %}
                    <option value="{{ location }}">{{ location }}</option>
                    {% endfor %}
                </select>
            </div>

            <div class="mb-3">
                <label for="sqft" class="form-label">Total Square Feet:</label>
                <input type="number" class="form-control" id="sqft" name="sqft" min="100" required>
            </div>

            <div class="mb-3">
                <label for="bath" class="form-label">Number of Bathrooms:</label>
                <input type="number" class="form-control" id="bath" name="bath" min="1" max="10" required>
            </div>

            <div class="mb-3">
                <label for="bhk" class="form-label">BHK (Bedrooms):</label>
                <input type="number" class="form-control" id="bhk" name="bhk" min="1" max="10" required>
            </div>

            <div class="text-center">
                <button type="submit" class="btn btn-primary">Predict Price</button>
            </div>
        </form>

        <!-- Inline prediction result, shown once the view passes `prediction` -->
        {% if prediction is not none %}
        <div class="prediction-result text-center">
            <h4>Predicted Price:</h4>
            <p class="h3">₹ {{ prediction }} Lakhs</p>

            {% if property_details %}
            <div class="property-details">
                <h5>Property Details:</h5>
                <ul class="list-unstyled">
                    <li><strong>Location:</strong> {{ property_details.location }}</li>
                    <li><strong>Area:</strong> {{ property_details.sqft }} sq.ft</li>
                    <li><strong>Bathrooms:</strong> {{ property_details.bath }}</li>
                    <li><strong>BHK:</strong> {{ property_details.bhk }}</li>
                </ul>
            </div>
            {% endif %}
        </div>
        {% endif %}
    </div>

    <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/js/bootstrap.bundle.min.js"></script>
    <script>
        // Bootstrap client-side form validation boilerplate:
        // block submission and show feedback when constraints fail.
        (function () {
            'use strict'
            var forms = document.querySelectorAll('.needs-validation')
            Array.prototype.slice.call(forms).forEach(function (form) {
                form.addEventListener('submit', function (event) {
                    if (!form.checkValidity()) {
                        event.preventDefault()
                        event.stopPropagation()
                    }
                    form.classList.add('was-validated')
                }, false)
            })
        })()
    </script>
</body>
</html>
|
templates/results.html
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>House Price Prediction Result</title>
    <!-- Bootstrap 5.1.3 from CDN -->
    <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet">
    <style>
        body {
            background-color: #f8f9fa;
            padding: 20px;
        }
        .result-container {
            max-width: 600px;
            margin: 50px auto;
            background-color: white;
            padding: 30px;
            border-radius: 10px;
            box-shadow: 0 0 10px rgba(0,0,0,0.1);
        }
        .price-display {
            background-color: #e9ecef;
            padding: 20px;
            border-radius: 5px;
            margin: 20px 0;
            text-align: center;
        }
        .property-details {
            margin: 20px 0;
            padding: 15px;
            border: 1px solid #dee2e6;
            border-radius: 5px;
        }
        .back-button {
            text-align: center;
            margin-top: 20px;
        }
    </style>
</head>
<body>
    <div class="result-container">
        <h2 class="text-center mb-4">Price Prediction Result</h2>

        <!-- Headline predicted price (in lakhs, set by the rendering view) -->
        <div class="price-display">
            <h3 class="mb-3">Predicted Price</h3>
            <h2 class="text-primary">₹ {{ predicted_price }} Lakhs</h2>
        </div>

        <!-- Echo of the submitted property attributes -->
        <div class="property-details">
            <h4>Property Details:</h4>
            <ul class="list-unstyled">
                <li><strong>Location:</strong> {{ location }}</li>
                <li><strong>Area:</strong> {{ sqft }} sq.ft</li>
                <li><strong>Bathrooms:</strong> {{ bath }}</li>
                <li><strong>BHK:</strong> {{ bhk }}</li>
            </ul>
        </div>

        <!-- Navigation back to the prediction form -->
        <div class="back-button">
            <a href="/" class="btn btn-primary">Make Another Prediction</a>
        </div>
    </div>

    <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/js/bootstrap.bundle.min.js"></script>
</body>
</html>
|
tests/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
|
tests/test2direct.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pickle
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
def load_model_and_features(model_path, feature_path):
    """Load the trained model and feature names from pickle files.

    Args:
        model_path (str): Path to the trained model pickle file.
        feature_path (str): Path to the feature names pickle file.

    Returns:
        tuple: (trained model, feature names)
    """
    def _unpickle(path):
        # Read one artifact in binary mode; file handle closed by `with`.
        with open(path, "rb") as fh:
            return pickle.load(fh)

    return _unpickle(model_path), _unpickle(feature_path)
|
24 |
+
|
25 |
+
def predict_price(location, sqft, bath, bhk, model, feature_names):
    """Predict the price for one property using the trained model.

    Builds a zero-filled feature vector, writes in the numeric features,
    sets the one-hot location indicator, then delegates to model.predict.

    Args:
        location (str): Location name (a one-hot column in feature_names).
        sqft (float): Total square footage.
        bath (int): Number of bathrooms.
        bhk (int): Number of bedrooms.
        model: Trained model object exposing .predict.
        feature_names (list): Ordered feature names the model was fit on.

    Returns:
        float: Predicted price.
    """
    x = np.zeros(len(feature_names))

    # Numeric features: fill each column that the model actually has.
    for name, value in (('total_sqft', sqft), ('bath', bath), ('bhk', bhk)):
        if name in feature_names:
            x[feature_names.index(name)] = value

    # One-hot location flag; an unknown location simply leaves all zeros.
    if location in feature_names:
        x[feature_names.index(location)] = 1

    return model.predict([x])[0]
|
58 |
+
|
59 |
+
def main():
    """Run a batch of sample predictions against the saved model and print them."""
    # Persisted model artifacts produced by the training pipeline.
    model_path = "models/lr_regg.pkl"
    feature_path = "models/feature_names.pkl"

    model, feature_names = load_model_and_features(model_path, feature_path)

    # Hand-picked sample inputs, including one deliberately unknown location.
    test_cases = [
        {"location": "Whitefield", "sqft": 1200, "bath": 2, "bhk": 2},
        {"location": "Banaswadi", "sqft": 1500, "bath": 3, "bhk": 3},
        {"location": "Basavangudi", "sqft": 1800, "bath": 3, "bhk": 4},
        {"location": "Nonexistent Location", "sqft": 1000, "bath": 2, "bhk": 3},
        {"location": "Electronic City Phase II", "sqft": 1056, "bath": 2, "bhk": 2},
        {"location": "Chikka Tirupathi", "sqft": 800, "bath": 2, "bhk": 2}
    ]

    print("\nPredictions:")
    for case in test_cases:
        location, sqft = case["location"], case["sqft"]
        bath, bhk = case["bath"], case["bhk"]
        try:
            predicted_price = predict_price(location, sqft, bath, bhk, model, feature_names)
            # Model output is divided by 10 to express the price in lakhs.
            print(f"Location: {location}, Sqft: {sqft}, Bath: {bath}, BHK: {bhk} -> Predicted Price: {predicted_price/10:.0f} lakhs")
        except Exception as e:
            print(f"Prediction failed for Location: {location}, Error: {e}")

if __name__ == "__main__":
    main()
|
tests/test_model.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pickle
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
# Load model and feature names
|
5 |
+
def load_model_and_features(model_path, feature_path):
    """Load the trained model and its ordered feature-name list from disk."""
    artifacts = []
    for path in (model_path, feature_path):
        # Each artifact is a separate pickle file, read in binary mode.
        with open(path, "rb") as fh:
            artifacts.append(pickle.load(fh))
    return artifacts[0], artifacts[1]
|
15 |
+
|
16 |
+
# Predict price using the model
|
17 |
+
def predict_price(location, sqft, bath, bhk, model, feature_names):
    """Build a one-row feature vector and return the model's price prediction.

    Numeric columns ('total_sqft', 'bath', 'bhk') receive the raw values;
    the column matching *location* is set to 1; every other column stays 0,
    so an unknown location degrades gracefully to the all-zero baseline.
    """
    index_of = feature_names.index
    x = np.zeros(len(feature_names))

    if 'total_sqft' in feature_names:
        x[index_of('total_sqft')] = sqft
    if 'bath' in feature_names:
        x[index_of('bath')] = bath
    if 'bhk' in feature_names:
        x[index_of('bhk')] = bhk

    if location in feature_names:
        x[index_of(location)] = 1

    return model.predict([x])[0]
|
36 |
+
|
37 |
+
# Test function
|
38 |
+
def test_house_price_predictions():
    """Validate saved-model predictions against known expected values.

    Loads the pickled model and feature names, runs each test case, and
    asserts the predicted price (converted to lakhs) equals the expected
    value.

    Fix: the original wrapped the assert in a broad ``except Exception``
    that also swallowed AssertionError, so a wrong prediction merely
    printed a warning and the test could never fail. Assertion failures
    now propagate; only the prediction call itself is guarded.
    """
    # Persisted artifacts produced by the training pipeline.
    model_path = "models/lr_regg.pkl"
    feature_path = "models/feature_names.pkl"

    model, feature_names = load_model_and_features(model_path, feature_path)

    # Test cases and expected outputs (expected prices in lakhs).
    test_cases = [
        {"location": "Whitefield", "sqft": 1200, "bath": 2, "bhk": 2, "expected": 94},
        {"location": "Banaswadi", "sqft": 1500, "bath": 3, "bhk": 3, "expected": 118},
        {"location": "Basavangudi", "sqft": 1800, "bath": 3, "bhk": 4, "expected": 142},
        {"location": "Nonexistent Location", "sqft": 1000, "bath": 2, "bhk": 3, "expected": 79},
        {"location": "Electronic City Phase II", "sqft": 1056, "bath": 2, "bhk": 2, "expected": 83},
        {"location": "Chikka Tirupathi", "sqft": 800, "bath": 2, "bhk": 2, "expected": 63}
    ]

    for case in test_cases:
        location = case["location"]
        sqft = case["sqft"]
        bath = case["bath"]
        bhk = case["bhk"]
        expected = case["expected"]

        # Guard only the prediction call: infrastructure failures surface
        # with context, while wrong values fall through to the assert below.
        try:
            predicted_price = predict_price(location, sqft, bath, bhk, model, feature_names)
        except Exception as e:
            raise RuntimeError(f"Prediction failed for Location: {location}, Error: {e}") from e

        assert round(predicted_price / 10) == expected, (
            f"Failed for Location: {location}, "
            f"Expected: {expected}, Got: {predicted_price/10:.0f} lakhs"
        )
        print(f"Test Passed: Location: {location}, Predicted: {predicted_price/10:.0f} lakhs")

# Run the tests
if __name__ == "__main__":
    test_house_price_predictions()
|