File size: 1,628 Bytes
825e978 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
import pandas as pd
import getString
import getNLP
import os
def one_hot_encode_objects(df, nlp, columns):
    """One-hot encode the object-dtype columns of *df*.

    Columns are skipped when they are word-embedded elsewhere (``nlp``),
    slated to be dropped (``columns``), contain ``"label"`` in their name
    (target columns must not be encoded), or hold unhashable values
    (lists/dicts/arrays) that ``pd.get_dummies`` cannot process.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; not modified in place (reassigned locally).
    nlp : collection of str
        Column names handled by NLP word embedding — excluded here.
    columns : collection of str
        Column names already consumed/dropped by earlier steps — excluded here.

    Returns
    -------
    pandas.DataFrame
        Copy of *df* with each encoded column replaced by its dummy columns.
    """
    object_cols = df.select_dtypes(include='object').columns
    for col in object_cols:
        # Guard clauses: leave NLP, to-be-dropped, and label columns untouched.
        if col in nlp or col in columns or "label" in col:
            continue
        # get_dummies needs hashable cell values; detect containers/arrays first.
        if df[col].apply(lambda x: isinstance(x, (list, tuple, dict, set)) or hasattr(x, '__array__')).any():
            print(f"Skipping column '{col}' due to unhashable values.")
            continue
        dummies = pd.get_dummies(df[col], prefix=col).astype(int)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(columns=[col])
    return df
def fixEmpty(df):
    """Normalize missing values in *df* and impute them.

    String sentinels ('undefined', 'null', 'NaN', 'None') are converted to
    ``pd.NA``, then object columns are filled with the literal 'Unknown'
    and numeric columns with their column mean. Other dtypes (e.g.
    datetime, bool) are left as-is since a mean fill is not meaningful.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. The caller's frame is NOT mutated; a filled copy is
        returned (previously this function modified its argument in place).

    Returns
    -------
    pandas.DataFrame
        Frame with sentinels replaced and missing values imputed. A fully
        empty numeric column keeps NaN (mean of nothing is NaN).
    """
    # replace() returns a new frame — avoids mutating the caller's argument.
    df = df.replace(['undefined', 'null', 'NaN', 'None'], pd.NA)
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].fillna('Unknown')
        elif pd.api.types.is_numeric_dtype(df[col]):
            # Mean imputation only where a mean is defined.
            df[col] = df[col].fillna(df[col].mean())
    return df
def preprocessing(query):
    """Run the full preprocessing pipeline for one dataset.

    Reads ``final/<query>.csv``, cleans missing values, applies
    project-specific column transformations from ``getString.getCodes``,
    word-embeds NLP columns via ``getNLP.wordEmbed``, one-hot encodes the
    remaining object columns, and writes ``processed/<query>.csv``.

    Parameters
    ----------
    query : str
        Dataset name; used as both the input and output CSV filename.
    """
    os.makedirs("./processed",exist_ok=True)
    df=pd.read_csv("final/"+query+".csv")
    # print(df.head())
    df=fixEmpty(df)
    # preDict: {new column name -> python expression string},
    # col: source columns consumed by those expressions (dropped below),
    # nlp: columns to word-embed — presumably derived from the query's
    # config; verify against getString.getCodes.
    preDict,col,nlp=getString.getCodes(query)
    if len(col)>0:
        for new_col, expr in preDict.items():
            # SECURITY: eval() executes arbitrary code from getString.getCodes.
            # Safe only if those expression strings are trusted project config,
            # never user input. The expressions reference the local name `df`.
            df[new_col] = eval(expr)
        df.drop(columns=col, inplace=True)
    if len(nlp)>0:
        df=getNLP.wordEmbed(df,nlp)
    # print(df.columns)
    df=one_hot_encode_objects(df,nlp,col)
    # df = df.astype('float32')
    df.to_csv("./processed/"+query+".csv", index=False)
    # print(df.head())
    # print(df.info())
# preprocessing("twitter sentiment analysis")