# Maaz Uddin
# allfilesupload
# e0a433a
# raw
# history blame contribute delete
# 4.03 kB
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
class EDA:
    """Exploratory-data-analysis helper around a single CSV dataset.

    The dataset is read into a pandas DataFrame by :meth:`load_data`; the
    remaining methods print summaries or draw matplotlib / seaborn plots of
    that DataFrame. Every analysis method requires the data to be loaded
    first and raises ``RuntimeError`` otherwise.
    """

    def __init__(self, data_path):
        """Remember where the dataset lives; nothing is read yet.

        Args:
            data_path: Location of the CSV, passed unchanged to
                ``pd.read_csv`` by :meth:`load_data`.
        """
        self.data_path = data_path
        self.data = None

    def _require_data(self):
        """Return the loaded DataFrame, or raise if nothing is loaded yet."""
        if self.data is None:
            raise RuntimeError(
                "No dataset loaded: call load_data() before running an analysis."
            )
        return self.data

    def load_data(self):
        """Read the CSV at ``data_path`` into ``self.data`` and return it."""
        self.data = pd.read_csv(self.data_path)
        return self.data

    def basic_info(self):
        """Print structure, shape, missing-value and duplicate counts.

        Returns:
            The ``describe()`` summary of the dataset.

        Raises:
            RuntimeError: If ``load_data`` has not been called.
        """
        data = self._require_data()
        print("\nDataset Info:\n")
        # DataFrame.info() writes its own report and returns None, so it must
        # not be wrapped in print() (that would emit a stray "None" line).
        data.info()
        print("\nShape:", data.shape)
        print("\nMissing Values:\n", data.isnull().sum())
        print("\nDuplicate Rows:", data.duplicated().sum())
        return data.describe()

    def missing_value_analysis(self):
        """Count missing values per column and plot the non-zero counts.

        A bar chart is shown only when at least one column has missing values.

        Returns:
            A Series of missing-value counts for the columns that have any,
            sorted in descending order (empty when nothing is missing).

        Raises:
            RuntimeError: If ``load_data`` has not been called.
        """
        missing_data = self._require_data().isnull().sum()
        missing_data = missing_data[missing_data > 0].sort_values(ascending=False)
        if not missing_data.empty:
            plt.figure(figsize=(8, 6))
            sns.barplot(x=missing_data.index, y=missing_data.values, palette='viridis')
            plt.title('Missing Values Count')
            plt.xticks(rotation=45)
            plt.ylabel('Count')
            plt.show()
        return missing_data

    def visualize_distributions(self):
        """Plot a histogram for every ``int64`` / ``float64`` column.

        Prints a notice and returns without plotting when the dataset has no
        such column, instead of failing inside the plotting call.

        Raises:
            RuntimeError: If ``load_data`` has not been called.
        """
        data = self._require_data()
        numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
        if numeric_cols.empty:
            print("No numerical features to plot.")
            return
        data[numeric_cols].hist(bins=15, figsize=(10, 8), color='skyblue', edgecolor='black')
        plt.suptitle('Feature Distributions', fontsize=16)
        plt.show()

    def correlation_heatmap(self):
        """Plot an annotated heatmap of the correlations between numeric columns.

        Prints a notice and returns without plotting when there is nothing
        numeric to correlate.

        Raises:
            RuntimeError: If ``load_data`` has not been called.
        """
        # numeric_only=True: text columns cannot be correlated, and recent
        # pandas versions raise on them instead of silently dropping them.
        corr = self._require_data().corr(numeric_only=True)
        if corr.empty:
            print("No numerical features to correlate.")
            return
        plt.figure(figsize=(10, 8))
        sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
        plt.title('Feature Correlation Heatmap')
        plt.show()

    def detect_outliers(self, feature):
        """Show a box plot of one column so outliers can be spotted visually.

        Args:
            feature: Name of the column to plot.

        Raises:
            RuntimeError: If ``load_data`` has not been called.
        """
        data = self._require_data()
        plt.figure(figsize=(8, 6))
        sns.boxplot(x=data[feature], color='lightblue')
        plt.title(f'Outliers in {feature}')
        plt.show()

    def feature_summary(self):
        """Print a per-column summary of categorical and numerical features.

        For ``object`` columns: the number of unique values and the ten most
        frequent values. For ``int64`` / ``float64`` columns: mean, median
        and standard deviation.

        Raises:
            RuntimeError: If ``load_data`` has not been called.
        """
        data = self._require_data()
        categorical_cols = data.select_dtypes(include=['object']).columns
        numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
        print("\nCategorical Features:")
        for col in categorical_cols:
            print(f"{col}: {data[col].nunique()} unique values")
            print(data[col].value_counts().head(10))
            print("---")
        print("\nNumerical Features:")
        for col in numeric_cols:
            print(f"{col}: Mean={data[col].mean()}, Median={data[col].median()}, Std={data[col].std()}")
            print("---")

    def pairwise_scatterplots(self, features):
        """Draw a seaborn pair plot (KDE on the diagonal) for selected columns.

        Args:
            features: List of column names to include in the grid.

        Raises:
            RuntimeError: If ``load_data`` has not been called.
        """
        data = self._require_data()
        sns.pairplot(data[features], diag_kind='kde', plot_kws={'alpha': 0.5})
        plt.suptitle('Pairwise Scatterplots', fontsize=16)
        plt.show()

    def target_analysis(self, target_col):
        """Plot a 30-bin histogram with KDE overlay of the target column.

        Args:
            target_col: Name of the target column.

        Raises:
            RuntimeError: If ``load_data`` has not been called.
        """
        data = self._require_data()
        plt.figure(figsize=(8, 6))
        sns.histplot(data[target_col], kde=True, bins=30, color='blue')
        plt.title(f'Distribution of {target_col}')
        plt.xlabel(target_col)
        plt.ylabel('Frequency')
        plt.show()
if __name__ == "__main__":
    # Demo run of the full EDA pipeline on the house-price CSV.
    # NOTE(review): the column names used below ('price', 'total_sqft',
    # 'bath', 'bhk') are assumed to exist in that file — confirm.
    eda = EDA(data_path="data/bengaluru_house_prices.csv")
    eda.load_data()
    eda.basic_info()
    # Keep the result: calling missing_value_analysis() a second time just to
    # print the summary would draw the same bar chart again.
    missing_summary = eda.missing_value_analysis()
    eda.visualize_distributions()
    eda.correlation_heatmap()
    eda.detect_outliers('price')
    eda.feature_summary()
    eda.pairwise_scatterplots(features=['price', 'total_sqft', 'bath', 'bhk'])
    eda.target_analysis(target_col='price')
    print("Missing values summary:")
    print(missing_summary)