import os

import pandas as pd

import getNLP
import getString


def one_hot_encode_objects(df, nlp, columns):
    """One-hot encode object columns, skipping NLP columns, engineered columns, and label columns."""
    object_cols = df.select_dtypes(include='object').columns
    for col in object_cols:
        if col not in nlp and col not in columns and "label" not in col:
            # Skip columns containing list/tuple/dict/set or array-like values,
            # which pd.get_dummies cannot encode.
            if df[col].apply(lambda x: isinstance(x, (list, tuple, dict, set)) or hasattr(x, '__array__')).any():
                print(f"Skipping column '{col}' due to unhashable values.")
                continue
            dummies = pd.get_dummies(df[col], prefix=col).astype(int)
            df = pd.concat([df, dummies], axis=1)
            df = df.drop(columns=[col])
    print(df.columns)
    return df


def fixEmpty(df):
    """Normalize placeholder missing values, then impute: 'Unknown' for object columns, column mean otherwise."""
    df.replace(['undefined', 'null', 'NaN', 'None'], pd.NA, inplace=True)
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].fillna('Unknown')
        else:
            df[col] = df[col].fillna(df[col].mean())
    return df


def preprocessing(query):
    """Load final/<query>.csv, clean and encode it, and write the result to processed/<query>.csv."""
    os.makedirs("./processed", exist_ok=True)
    df = pd.read_csv("final/" + query + ".csv")
    # print(df.head())
    df = fixEmpty(df)

    # getString.getCodes returns the generated column expressions (preDict), the
    # source columns they replace (col), and the columns needing NLP embeddings (nlp).
    preDict, col, nlp = getString.getCodes(query)
    if len(col) > 0:
        for new_col, expr in preDict.items():
            # Each expression is evaluated against the in-scope DataFrame `df`.
            df[new_col] = eval(expr)
        df.drop(columns=col, inplace=True)
    if len(nlp) > 0:
        df = getNLP.wordEmbed(df, nlp)
    # print(df.columns)

    df = one_hot_encode_objects(df, nlp, col)
    # df = df.astype('float32')
    df.to_csv("./processed/" + query + ".csv", index=False)
    # print(df.head())
    # print(df.info())


# preprocessing("twitter sentiment analysis")