# TODO:
#   1. Separate attributes and data.
#   2. Remove the datatypes from the attributes.
#
# Dataset: C:\Users\Niall Dcunha\DatasetCreator\house price prediction\21754539_dataset
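"""Merge multiple CSV datasets with heterogeneous column names into one
training-ready file.

Pipeline: load every CSV in a folder, drop duplicate dataframes, apply a
manual rename map, then unify the remaining column names either via an
OpenAI-suggested mapping or, if that fails, via fuzzy column matching
(rapidfuzz). A final heuristic decides whether the merged frame or the
largest single frame is saved under ./final/.
"""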
import os
import glob
import re
import ast
from itertools import combinations

import pandas as pd
from rapidfuzz import fuzz, process
from dotenv import load_dotenv
from openai import OpenAI

# Manual rename map to standardize some known column-name variations.
manual_rename_map = {
    "review": "text",
    "text": "text",
    "NumBedrooms": "bedrooms",
    "HousePrice": "price",
    "TARGET(PRICE_IN_LACS)": "price",
    "SquareFootage": "area",
    "SQUARE_FT": "area",
    "sentiment": "label",
    "target": "label",
    "type": "label",
    "variety": "label",
    "class": "label",
    "HeartDisease": "label",
    "Heart Attack Risk (Binary)": "label",
    "Heart Attack Risk": "label",
}


def normalize(col):
    """Lowercase a column name and strip every non-alphanumeric character."""
    return re.sub(r'[^a-z0-9]', '', col.lower())


def apply_manual_renaming(df, rename_map):
    """Rename any columns of df that appear in the manual rename map."""
    renamed = {col: rename_map[col] for col in df.columns if col in rename_map}
    return df.rename(columns=renamed)


def get_fuzzy_common_columns(cols_list, threshold=75):
    """Return the columns of the first set that have a fuzzy match
    (token_sort_ratio >= threshold) in every other set."""
    base = cols_list[0]
    common = set()
    for col in base:
        match_all = True
        for other in cols_list[1:]:
            _, score, _ = process.extractOne(col, other, scorer=fuzz.token_sort_ratio)
            if score < threshold:
                match_all = False
                break
        if match_all:
            common.add(col)
    return common
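# A minimal sketch of how get_fuzzy_common_columns behaves, using made-up
# (already-normalized) column sets; the quoted scores are approximate
# rapidfuzz token_sort_ratio values:
#
#   >>> get_fuzzy_common_columns([
#   ...     {"bedrooms", "price", "zipcode"},
#   ...     {"bedroomsabvgr", "price", "lotarea"},
#   ... ])
#   {'bedrooms', 'price'}
#
# "price" matches exactly (score 100) and "bedrooms" vs "bedroomsabvgr"
# scores roughly 76, just above the default threshold of 75, while
# "zipcode" has no close counterpart in the second set and is dropped.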
def sortFiles(dfs):
    """Drop dataframes that are exact duplicates of an earlier one.

    Returns the unique dataframes plus the indices that were kept, so
    parallel lists (e.g. the file paths) can be filtered to match.
    """
    unique_dfs = []
    seen = []
    for i, df1 in enumerate(dfs):
        duplicate = False
        for j in seen:
            df2 = dfs[j]
            if df1.shape != df2.shape:
                continue
            if df1.reset_index(drop=True).equals(df2.reset_index(drop=True)):
                duplicate = True
                break
        if not duplicate:
            unique_dfs.append(df1)
            seen.append(i)
    return unique_dfs, seen


def load_csv_files(folder_path):
    """Read every CSV in the folder, returning the frames, their column
    lists, and the paths that loaded successfully."""
    csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
    dfs = []
    column_sets = []
    paths = []
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            dfs.append(df)
            column_sets.append(list(df.columns))
            paths.append(file)
            print(f"āœ… Loaded: {os.path.basename(file)}")
        except Exception as e:
            print(f"āŒ Failed to load {file}: {e}")
    return dfs, column_sets, paths


def generate_mapping_prompt(column_sets):
    """Build the prompt asking the model for a unified column mapping."""
    prompt = (
        "You are a data scientist helping to merge multiple machine learning prediction datasets. "
        "Each CSV file may have different column names, even if they represent similar types of data. "
        "Your task is to identify and map these similar columns across datasets to a common, unified name. "
        "Columns with clearly similar features (e.g., 'Bedrooms' and 'BedroomsAbvGr') should be merged "
        "into one column with a relevant name like 'bedrooms'.\n\n"
        "Avoid keeping redundant or unique columns that do not have any logical counterpart in other "
        "datasets unless they are essential. The goal is not to maximize the number of columns or rows, "
        "but to create a clean, consistent dataset for training ML models.\n\n"
        "Examples:\n"
        "- Dataset1: 'Locality' -> Mumbai, Delhi\n"
        "- Dataset2: 'Places' -> Goa, Singapore\n"
        "→ Merge both into a common column like 'location'.\n\n"
        "Please also identify likely label or target columns that are typically used for prediction "
        "(e.g., price, sentiment, outcome, quality).\n\n"
    )
    for i, cols in enumerate(column_sets):
        prompt += f"CSV {i+1}: {cols}\n"
    prompt += "\nPlease return:\n```python\ncolumn_mapping = { ... }\nlabel_columns = [ ... ]\n```"
    return prompt


def get_column_mapping_from_openai(column_sets):
    """Ask the model for a column mapping and label-column suggestions,
    then parse both literals out of its reply."""
    load_dotenv()
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        base_url=os.getenv("OPENAI_API_BASE"),  # None falls back to the default endpoint
    )
    prompt = generate_mapping_prompt(column_sets)
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful data scientist."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.3,
    )
    content = response.choices[0].message.content
    try:
        column_mapping_match = re.search(r"column_mapping\s*=\s*(\{.*?\})", content, re.DOTALL)
        label_columns_match = re.search(r"label_columns\s*=\s*(\[.*?\])", content, re.DOTALL)
        column_mapping = ast.literal_eval(column_mapping_match.group(1)) if column_mapping_match else {}
        label_columns = ast.literal_eval(label_columns_match.group(1)) if label_columns_match else []
    except Exception as e:
        print("āš ļø Error parsing OpenAI response:")
        print(content)
        raise e
    return column_mapping, label_columns
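# For reference, the parsing above expects a reply shaped like the format the
# prompt requests (illustrative model output; the real reply will vary):
#
#   ```python
#   column_mapping = {"HousePrice": "price", "SQUARE_FT": "area"}
#   label_columns = ["price"]
#   ```
#
# ast.literal_eval turns the captured dict/list literals into Python objects
# without executing arbitrary code, unlike eval.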
def clean_and_merge(folder, query=None, use_ai=True):
    os.makedirs("./final", exist_ok=True)
    dfs, column_sets, csv_paths = load_csv_files(folder)
    if not dfs:
        print("No valid CSVs found.")
        return

    # Drop duplicate frames and keep the path list aligned with the survivors.
    dfs, kept_indices = sortFiles(dfs)
    csv_paths = [csv_paths[i] for i in kept_indices]

    dfs = [apply_manual_renaming(df, manual_rename_map) for df in dfs]
    column_sets = [list(df.columns) for df in dfs]  # refresh after dedup/renaming

    # Label assignment fallback, done before merging so the label survives into
    # the merged output: derive a label from the file name (minus the query
    # words) for any frame that still lacks a 'label' column.
    for i, df in enumerate(dfs):
        if 'label' not in df.columns:
            name_cleaned = os.path.basename(csv_paths[i]).split(".")[0].lower()
            if query:
                words = set(re.sub(r'[^a-z]', ' ', query.lower()).split())
                for word in words:
                    name_cleaned = name_cleaned.replace(word, "")
            df['label'] = name_cleaned

    if use_ai:
        try:
            column_mapping, label_columns = get_column_mapping_from_openai(column_sets)
            dfs = [df.rename(columns={col: column_mapping.get(col, col) for col in df.columns})
                   for df in dfs]
        except Exception as e:
            print("Falling back to fuzzy matching due to OpenAI error:", e)
            use_ai = False

    if not use_ai:
        # Normalize columns for the fuzzy-match fallback.
        normalized_cols = [{normalize(col) for col in df.columns} for df in dfs]

        # Find the combination of frames with the largest set of fuzzy-common columns.
        max_common = set()
        best_combo = []
        for i in range(2, len(dfs) + 1):
            for combo in combinations(range(len(dfs)), i):
                selected = [normalized_cols[j] for j in combo]
                fuzzy_common = get_fuzzy_common_columns(selected)
                if len(fuzzy_common) >= len(max_common):
                    max_common = fuzzy_common
                    best_combo = combo

        # Harmonize and align each chosen frame to the common column names.
        aligned_dfs = []
        for idx in best_combo:
            df = dfs[idx]
            col_map = {}
            for std_col in max_common:
                match, _, _ = process.extractOne(std_col, [normalize(col) for col in df.columns])
                for col in df.columns:
                    if normalize(col) == match:
                        col_map[col] = std_col
                        break
            df_subset = df[list(col_map.keys())].rename(columns=col_map)
            aligned_dfs.append(df_subset)
        combined_df = pd.concat(aligned_dfs, ignore_index=True)
    else:
        combined_df = pd.concat(dfs, ignore_index=True)

    # Heuristic: decide whether the merge was good enough, or whether the
    # largest single file should be saved instead.
    largest_df = max(dfs, key=len)
    flag = False
    if len(largest_df) > len(combined_df) and len(largest_df.columns) > 2:
        flag = True
    elif (len(combined_df) > len(largest_df)
          and (len(largest_df.columns) - len(combined_df.columns)) > 3
          and len(largest_df.columns) < 7):
        flag = True

    output_file = f"./final/{query or os.path.basename(folder)}.csv"
    if flag:
        largest_df.to_csv(output_file, index=False)
        print(f"āš ļø Saved fallback single file due to poor merge: {output_file}")
    else:
        combined_df.to_csv(output_file, index=False)
        print(f"āœ… Saved merged file: {output_file}")


# Example usage:
if __name__ == "__main__":
    clean_and_merge("house", query="house", use_ai=True)
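# The OpenAI client above reads its credentials from the environment (a .env
# file picked up by load_dotenv also works). Illustrative values only:
#
#   OPENAI_API_KEY=sk-...
#   OPENAI_API_BASE=https://your-endpoint.example.com/v1  # optional: Azure/self-hosted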