Dataset / preprocessing /getString.py
vansh9878's picture
files added
825e978
import os
import pandas as pd
import ast
import google.generativeai as genai
from dotenv import load_dotenv
# Load variables from a .env file into the process environment at import time.
# Presumably this is where "gemini_api" (read via os.getenv in getCodes) is
# defined -- TODO confirm against the deployment's .env file.
load_dotenv()
def extract_column_samples(df, n=5):
    """Collect a small preview of every column in *df*.

    Args:
        df: DataFrame to sample.
        n: Row count passed to ``head`` for each column.

    Returns:
        A dict mapping each column label to a plain list of the values
        returned by ``df[label].head(n)``, in column order.
    """
    return {label: df[label].head(n).tolist() for label in df.columns}
def _build_prompt(samples):
    """Return the instruction prompt that asks the model to classify columns.

    Args:
        samples: Mapping of column name to example values; it is interpolated
            verbatim at the end of the prompt.
    """
    return (
        "You are a data analyst. I will give you a dictionary containing column names with example values from a dataset.\n\n"
        "Your task is to:\n"
        "1. Identify columns where one-hot encoding is *not suitable*.\n"
        "2. For each of these, determine if it requires:\n"
        " - feature extraction (e.g., from datetime or strings), or\n"
        " - use of word embeddings (e.g., for free text or high-cardinality text).\n\n"
        "For feature extraction columns:\n"
        "- Create a **Python dictionary** where:\n"
        " * Each key is a new, meaningful column name.\n"
        " * Each value is a **valid Pandas expression string** that derives the new column from the original `df` DataFrame.\n"
        "- Also return a **Python list** of original column names that were used in this dictionary.\n\n"
        "For columns requiring word embeddings:\n"
        "- Return a separate **Python list** of these column names.\n"
        "- If any column appears in both cases, include it *only* in the word embedding list.\n\n"
        "Your output **must follow this exact format** with no additional explanation or markdown. Only return the following inside a single Python code block:\n"
        "```python\n"
        "# Dictionary of transformations\n"
        "{'new_col1': \"some pandas expression\", 'new_col2': \"some other pandas expression\"}\n\n"
        "# Array of columns used in the dictionary\n"
        "['col1', 'col2']\n"
        "# Array of columns that require the use of word embeddings\n"
        "['col3', 'col4']\n"
        "```\n\n"
        "**DO NOT** include any explanation, reasoning, extra code, or markdown outside of the code block. Only return the exact format shown above. Do not generate or describe functions.\n\n"
        f"Here is the input :\n{samples}\n"
    )


def _extract_fenced_code(reply_text):
    """Return the body of the first Markdown code fence in *reply_text*.

    The language tag on the opening fence line (``python``, ``py``, or none)
    is skipped. If the reply has no fence at all, the stripped reply itself is
    returned so an unfenced answer can still be parsed. A missing closing
    fence is tolerated.
    """
    text = reply_text.strip()
    fence_start = text.find("```")
    if fence_start == -1:
        return text
    body_start = text.find("\n", fence_start)
    if body_start == -1:
        # Opening fence with nothing after it on later lines: no code body.
        return ""
    fence_end = text.find("```", body_start)
    if fence_end == -1:
        fence_end = len(text)
    return text[body_start + 1:fence_end]


def _parse_model_reply(reply_text):
    """Parse the model reply into ``(dict, list, list)``.

    The three literals are located by position rather than by the exact
    wording of the comment headers, so small deviations in the reply format
    (different fence tag, reworded headers, multi-line literals, or
    ``name = literal`` assignments) do not break parsing.

    Raises:
        SyntaxError: If the fenced text is not syntactically valid Python.
        ValueError: If a value is not a plain literal, if the reply does not
            contain exactly three literals, or if they are not a dict
            followed by two lists.
    """
    code = _extract_fenced_code(reply_text)
    # The reply should hold only literals and comments, so indentation carries
    # no meaning; left-stripping each line keeps ast.parse from rejecting a
    # uniformly indented code block.
    code = "\n".join(line.lstrip() for line in code.splitlines())
    tree = ast.parse(code)
    literals = [
        ast.literal_eval(stmt.value)
        for stmt in tree.body
        if isinstance(stmt, (ast.Expr, ast.Assign))
    ]
    if len(literals) != 3:
        raise ValueError(
            "expected 3 literals (dict, list, list) in the model reply, "
            f"found {len(literals)}"
        )
    preprocessing_code, actual_list, nlp = literals
    if not (
        isinstance(preprocessing_code, dict)
        and isinstance(actual_list, list)
        and isinstance(nlp, list)
    ):
        raise ValueError(
            "model reply must contain a dict followed by two lists, got "
            f"{type(preprocessing_code).__name__}, "
            f"{type(actual_list).__name__}, {type(nlp).__name__}"
        )
    return preprocessing_code, actual_list, nlp


def getCodes(query):
    """Ask the Gemini model which columns of a dataset need special handling.

    Reads ``final/<query>.csv``, samples the leading values of every column
    with ``extract_column_samples``, sends them to the ``gemini-2.0-flash``
    model, prints the raw reply, and parses it.

    Args:
        query: Base name (without ``.csv``) of the file under ``final/``,
            e.g. ``"covid 19"``.

    Returns:
        A tuple ``(preprocessing_code, actual_list, nlp)``:
        ``preprocessing_code`` is a dict mapping new column names to pandas
        expression strings, ``actual_list`` is the list of original columns
        those expressions use, and ``nlp`` is the list of columns the model
        marked for word embeddings.

    Raises:
        SyntaxError: If the model reply is not valid Python.
        ValueError: If the model reply does not contain a dict followed by
            two lists as plain literals.

    NOTE(review): the expression strings in ``preprocessing_code`` are
    model-generated. If callers pass them to ``eval``/``DataFrame.eval``,
    they should be treated as untrusted code -- confirm at the call sites.
    """
    path = "final/" + query + ".csv"
    df = pd.read_csv(path)
    samples = extract_column_samples(df)
    prompt = _build_prompt(samples)

    genai.configure(api_key=os.getenv("gemini_api"))
    model = genai.GenerativeModel("gemini-2.0-flash")
    response = model.generate_content(prompt)
    merge_map_text = response.text.strip()
    print(merge_map_text)

    return _parse_model_reply(merge_map_text)
# getCodes(extract_column_samples)