|
import pandas as pd |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
|
|
class EDA:
    """Exploratory data analysis helper for a single CSV dataset.

    Typical use: construct with a path, call ``load_data()``, then call any
    of the reporting or plotting methods. Reporting methods print to stdout;
    plotting methods draw with matplotlib/seaborn and call ``plt.show()``.

    Attributes are plain instance attributes:
        data_path: the source handed to ``pandas.read_csv``.
        data: the loaded DataFrame, or ``None`` until ``load_data()`` runs.
    """

    # Dtypes treated as "numerical" by the distribution and summary methods.
    _NUMERIC_DTYPES = ['int64', 'float64']

    def __init__(self, data_path):
        """Remember the dataset path; nothing is read until ``load_data()``.

        Args:
            data_path: Path (or other source accepted by ``pandas.read_csv``)
                of the CSV file to analyze.
        """
        self.data_path = data_path
        self.data = None

    def _require_data(self):
        """Return the loaded DataFrame, failing clearly if none is loaded.

        Raises:
            RuntimeError: If ``self.data`` is still ``None``.
        """
        if self.data is None:
            raise RuntimeError(
                "No data loaded; call load_data() before running the analysis."
            )
        return self.data

    def load_data(self):
        """Read the CSV at ``self.data_path`` into ``self.data``.

        Returns:
            The loaded DataFrame (also stored on the instance).
        """
        self.data = pd.read_csv(self.data_path)
        return self.data

    def basic_info(self):
        """Print structure, shape, missing-value and duplicate counts.

        Returns:
            The result of ``DataFrame.describe()`` for the loaded data.

        Raises:
            RuntimeError: If no data has been loaded.
        """
        data = self._require_data()
        print("\nDataset Info:\n")
        # DataFrame.info() writes its report itself and returns None, so it
        # must not be wrapped in print() (that would emit a stray "None").
        data.info()
        print("\nShape:", data.shape)
        print("\nMissing Values:\n", data.isnull().sum())
        print("\nDuplicate Rows:", data.duplicated().sum())
        return data.describe()

    def missing_value_analysis(self):
        """Count missing values per column and plot the non-zero counts.

        A bar chart is drawn only when at least one column has missing
        values.

        Returns:
            A Series of missing-value counts, restricted to columns with at
            least one missing value and sorted in descending order.

        Raises:
            RuntimeError: If no data has been loaded.
        """
        data = self._require_data()
        missing_data = data.isnull().sum()
        missing_data = missing_data[missing_data > 0].sort_values(ascending=False)

        if not missing_data.empty:
            plt.figure(figsize=(8, 6))
            sns.barplot(x=missing_data.index, y=missing_data.values, palette='viridis')
            plt.title('Missing Values Count')
            plt.xticks(rotation=45)
            plt.ylabel('Count')
            plt.show()

        return missing_data

    def visualize_distributions(self):
        """Plot a histogram for every int64/float64 column.

        Raises:
            RuntimeError: If no data has been loaded.
        """
        data = self._require_data()
        numeric_cols = data.select_dtypes(include=self._NUMERIC_DTYPES).columns
        data[numeric_cols].hist(bins=15, figsize=(10, 8), color='skyblue', edgecolor='black')
        plt.suptitle('Feature Distributions', fontsize=16)
        plt.show()

    def correlation_heatmap(self):
        """Plot an annotated heatmap of pairwise feature correlations.

        Raises:
            RuntimeError: If no data has been loaded.
        """
        data = self._require_data()
        # Restrict to numeric columns explicitly: since pandas 2.0,
        # DataFrame.corr() no longer drops non-numeric columns by default and
        # raises when the frame contains e.g. string columns.
        correlations = data.corr(numeric_only=True)
        plt.figure(figsize=(10, 8))
        sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt='.2f')
        plt.title('Feature Correlation Heatmap')
        plt.show()

    def detect_outliers(self, feature):
        """Draw a box plot of one column so its outliers can be inspected.

        This method only visualizes; it does not compute or return the
        outlying rows.

        Args:
            feature: Name of the column to plot.

        Raises:
            RuntimeError: If no data has been loaded.
        """
        data = self._require_data()
        plt.figure(figsize=(8, 6))
        sns.boxplot(x=data[feature], color='lightblue')
        plt.title(f'Outliers in {feature}')
        plt.show()

    def feature_summary(self):
        """Print a per-column summary of categorical and numerical features.

        For each object-dtype column: the number of unique values and the ten
        most frequent values. For each int64/float64 column: mean, median and
        standard deviation.

        Raises:
            RuntimeError: If no data has been loaded.
        """
        data = self._require_data()
        categorical_cols = data.select_dtypes(include=['object']).columns
        numeric_cols = data.select_dtypes(include=self._NUMERIC_DTYPES).columns

        print("\nCategorical Features:")
        for col in categorical_cols:
            print(f"{col}: {data[col].nunique()} unique values")
            print(data[col].value_counts().head(10))
            print("---")

        print("\nNumerical Features:")
        for col in numeric_cols:
            print(f"{col}: Mean={data[col].mean()}, Median={data[col].median()}, Std={data[col].std()}")
            print("---")

    def pairwise_scatterplots(self, features):
        """Plot a seaborn pair plot (KDE on the diagonal) for some columns.

        Args:
            features: List of column names to include in the pair plot.

        Raises:
            RuntimeError: If no data has been loaded.
        """
        data = self._require_data()
        sns.pairplot(data[features], diag_kind='kde', plot_kws={'alpha': 0.5})
        plt.suptitle('Pairwise Scatterplots', fontsize=16)
        plt.show()

    def target_analysis(self, target_col):
        """Plot a histogram with KDE overlay of the target column.

        Args:
            target_col: Name of the target column.

        Raises:
            RuntimeError: If no data has been loaded.
        """
        data = self._require_data()
        plt.figure(figsize=(8, 6))
        sns.histplot(data[target_col], kde=True, bins=30, color='blue')
        plt.title(f'Distribution of {target_col}')
        plt.xlabel(target_col)
        plt.ylabel('Frequency')
        plt.show()
|
|
|
|
|
def main():
    """Run the full EDA walkthrough on the Bengaluru house-price CSV."""
    eda = EDA(data_path="data/bengaluru_house_prices.csv")
    eda.load_data()
    eda.basic_info()
    # Keep the result so the summary can be printed at the end without
    # calling the method again (a second call would redraw the bar chart).
    missing_summary = eda.missing_value_analysis()
    eda.visualize_distributions()
    eda.correlation_heatmap()
    eda.detect_outliers('price')
    eda.feature_summary()
    eda.pairwise_scatterplots(features=['price', 'total_sqft', 'bath', 'bhk'])
    eda.target_analysis(target_col='price')
    print("Missing values summary:")
    print(missing_summary)


if __name__ == "__main__":
    main()