|
import os |
|
import pandas as pd |
|
import ast |
|
import google.generativeai as genai |
|
from dotenv import load_dotenv |
|
|
|
# Load environment variables from a .env file (python-dotenv) at import time;
# getCodes() later reads the "gemini_api" key via os.getenv.
load_dotenv()
|
|
|
def extract_column_samples(df, n=5):
    """Return the leading values of every column in *df*.

    Args:
        df: DataFrame to sample from.
        n: Number of leading rows to take (passed to ``head``).

    Returns:
        A dict mapping each column label to a plain list holding that
        column's first ``n`` values.
    """
    # Slice the frame once, then convert each column of the preview to a list.
    preview = df.head(n)
    return {label: preview[label].tolist() for label in preview.columns}
|
|
|
def getCodes(query):
    """Ask Gemini how the columns of ``final/<query>.csv`` should be preprocessed.

    The CSV is loaded with pandas, a few leading values of every column are
    embedded in a prompt, the prompt is sent to the ``gemini-2.0-flash``
    model, the raw reply is printed, and the reply is parsed into three
    Python literals.

    Args:
        query: Base name (without ``.csv``) of the file inside ``final/``.

    Returns:
        A tuple ``(preprocessing_code, actual_list, nlp)`` holding, in the
        order the prompt requests them: the dictionary of new column names
        to pandas expression strings, the list of original columns used by
        that dictionary, and the list of columns that need word embeddings.
        The values are whatever literals the model produced; they are not
        validated beyond being parseable.

    Raises:
        ValueError: If the reply lacks one of the two section-marker
            comments, or a section is not a plain Python literal.
        SyntaxError: If a section of the reply is not valid Python syntax.
    """
    path = "final/" + query + ".csv"
    df = pd.read_csv(path)

    samples = extract_column_samples(df)
    prompt = _build_encoding_prompt(samples)

    genai.configure(api_key=os.getenv("gemini_api"))
    model = genai.GenerativeModel("gemini-2.0-flash")
    response = model.generate_content(prompt)

    merge_map_text = response.text.strip()
    # Echo the raw reply so a parsing failure can be diagnosed from the output.
    print(merge_map_text)

    preprocessing_code, actual_list, nlp = _parse_encoding_reply(merge_map_text)
    return preprocessing_code, actual_list, nlp


def _build_encoding_prompt(samples):
    """Return the instruction prompt with the column *samples* appended."""
    return (
        "You are a data analyst. I will give you a dictionary containing column names with example values from a dataset.\n\n"
        "Your task is to:\n"
        "1. Identify columns where one-hot encoding is *not suitable*.\n"
        "2. For each of these, determine if it requires:\n"
        " - feature extraction (e.g., from datetime or strings), or\n"
        " - use of word embeddings (e.g., for free text or high-cardinality text).\n\n"
        "For feature extraction columns:\n"
        "- Create a **Python dictionary** where:\n"
        " * Each key is a new, meaningful column name.\n"
        " * Each value is a **valid Pandas expression string** that derives the new column from the original `df` DataFrame.\n"
        "- Also return a **Python list** of original column names that were used in this dictionary.\n\n"
        "For columns requiring word embeddings:\n"
        "- Return a separate **Python list** of these column names.\n"
        "- If any column appears in both cases, include it *only* in the word embedding list.\n\n"
        "Your output **must follow this exact format** with no additional explanation or markdown. Only return the following inside a single Python code block:\n"
        "```python\n"
        "# Dictionary of transformations\n"
        "{'new_col1': \"some pandas expression\", 'new_col2': \"some other pandas expression\"}\n\n"
        "# Array of columns used in the dictionary\n"
        "['col1', 'col2']\n"
        "# Array of columns that require the use of word embeddings\n"
        "['col3', 'col4']\n"
        "```\n\n"
        "**DO NOT** include any explanation, reasoning, extra code, or markdown outside of the code block. Only return the exact format shown above. Do not generate or describe functions.\n\n"
        f"Here is the input :\n{samples}\n"
    )


def _parse_encoding_reply(reply):
    """Evaluate the three literal sections of the model *reply*.

    Returns the sections in reply order: the literal before the first
    marker comment, the one between the two markers, and the one after the
    second marker. Raises ``ValueError`` when a marker comment is missing.
    """
    used_marker = "# Array of columns used in the dictionary"
    embedding_marker = "# Array of columns that require the use of word embeddings"

    _, fence, fenced = reply.partition("```")
    if fence:
        # Keep only the first fenced block so that any text the model adds
        # after the closing fence cannot leak into the last literal.
        payload = fenced.partition("```")[0]
        tag, newline, rest = payload.partition("\n")
        # Drop the fence's language tag; the model may omit it or vary it.
        if newline and tag.strip().lower() in ("", "py", "python"):
            payload = rest
    else:
        # No fence at all: fall back to parsing the whole reply.
        payload = reply

    dict_src, has_used, tail = payload.partition(used_marker)
    used_src, has_embedding, embedding_src = tail.partition(embedding_marker)
    if not (has_used and has_embedding):
        raise ValueError(
            "model reply is missing the expected section markers "
            f"({used_marker!r} and {embedding_marker!r})"
        )

    # literal_eval only accepts plain literals, so the model-written text is
    # never executed here; leading comment lines in a section are ignored.
    return (
        ast.literal_eval(dict_src.strip()),
        ast.literal_eval(used_src.strip()),
        ast.literal_eval(embedding_src.strip()),
    )
|
|
|
|
|
|
|
|