File size: 3,173 Bytes
825e978
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
import pandas as pd
import ast
import google.generativeai as genai
from dotenv import load_dotenv

load_dotenv()

def extract_column_samples(df, n=5):
    """Collect the leading values of every column in ``df``.

    Args:
        df: DataFrame to sample from.
        n: Number of leading rows to take per column (passed to ``head``).

    Returns:
        A dict keyed by column label, in ``df.columns`` order, whose values
        are plain Python lists holding the first ``n`` entries of that column.
    """
    return {column: df[column].head(n).tolist() for column in df.columns}

# Section markers the prompt asks the model to emit. The parser cuts the reply
# at these exact strings, so the prompt and the parser share one definition.
_CODE_FENCE_OPEN = "```python"
_USED_COLUMNS_MARKER = "# Array of columns used in the dictionary"
_EMBEDDING_COLUMNS_MARKER = "# Array of columns that require the use of word embeddings"


class _ModelReplyFormatError(ValueError, IndexError):
    """Raised when the model reply lacks an expected section marker.

    Derives from IndexError as well as ValueError because a missing marker
    used to surface as an IndexError from list indexing; handlers written
    against that behavior keep working.
    """


def _build_prompt(samples):
    """Return the instruction prompt with the column samples appended."""
    return (
        "You are a data analyst. I will give you a dictionary containing column names with example values from a dataset.\n\n"
        "Your task is to:\n"
        "1. Identify columns where one-hot encoding is *not suitable*.\n"
        "2. For each of these, determine if it requires:\n"
        "   - feature extraction (e.g., from datetime or strings), or\n"
        "   - use of word embeddings (e.g., for free text or high-cardinality text).\n\n"
        "For feature extraction columns:\n"
        "- Create a **Python dictionary** where:\n"
        "  * Each key is a new, meaningful column name.\n"
        "  * Each value is a **valid Pandas expression string** that derives the new column from the original `df` DataFrame.\n"
        "- Also return a **Python list** of original column names that were used in this dictionary.\n\n"
        "For columns requiring word embeddings:\n"
        "- Return a separate **Python list** of these column names.\n"
        "- If any column appears in both cases, include it *only* in the word embedding list.\n\n"
        "Your output **must follow this exact format** with no additional explanation or markdown. Only return the following inside a single Python code block:\n"
        f"{_CODE_FENCE_OPEN}\n"
        "# Dictionary of transformations\n"
        "{'new_col1': \"some pandas expression\", 'new_col2': \"some other pandas expression\"}\n\n"
        f"{_USED_COLUMNS_MARKER}\n"
        "['col1', 'col2']\n"
        f"{_EMBEDDING_COLUMNS_MARKER}\n"
        "['col3', 'col4']\n"
        "```\n\n"
        "**DO NOT** include any explanation, reasoning, extra code, or markdown outside of the code block. Only return the exact format shown above. Do not generate or describe functions.\n\n"
        f"Here is the input :\n{samples}\n"
    )


def _parse_model_reply(reply_text):
    """Split a model reply into its three literal sections and evaluate them.

    Returns:
        A tuple ``(transformations, used_columns, embedding_columns)`` holding
        whatever Python literals the three sections contain.

    Raises:
        _ModelReplyFormatError: If either section marker is missing.
        ValueError, SyntaxError: From ``ast.literal_eval`` when a section is
            not a valid Python literal.
    """
    _, fence, payload = reply_text.partition(_CODE_FENCE_OPEN)
    if not fence:
        # No "```python" tag: fall back to the whole reply, dropping a bare
        # opening fence if present, instead of failing before parsing starts.
        payload = reply_text.strip()
        if payload.startswith("```"):
            payload = payload[3:]

    dict_src, used_marker, remainder = payload.partition(_USED_COLUMNS_MARKER)
    used_src, embedding_marker, embedding_src = remainder.partition(
        _EMBEDDING_COLUMNS_MARKER
    )
    if not (used_marker and embedding_marker):
        raise _ModelReplyFormatError(
            "model reply is missing an expected section marker "
            f"({_USED_COLUMNS_MARKER!r} / {_EMBEDDING_COLUMNS_MARKER!r})"
        )

    # literal_eval only accepts literals, so nothing in the reply is executed
    # here. A leading comment line inside a section is ignored by the parser.
    transformations = ast.literal_eval(dict_src.strip())
    used_columns = ast.literal_eval(used_src.strip())
    # The closing fence trails the last section; remove it before evaluating.
    embedding_columns = ast.literal_eval(embedding_src.replace("```", "").strip())
    return transformations, used_columns, embedding_columns


def getCodes(query):
    """Ask the Gemini model for preprocessing suggestions for one dataset.

    Reads ``final/<query>.csv``, samples the first rows of every column via
    ``extract_column_samples``, sends them to the ``gemini-2.0-flash`` model
    and parses the structured reply. The raw reply is printed to stdout.

    Args:
        query: Base name (without extension) of the CSV file under ``final/``.

    Returns:
        A tuple ``(preprocessing_code, actual_list, nlp)`` parsed from the
        reply. The prompt requests, in order: a dict mapping new column names
        to pandas expression strings, a list of the original columns used by
        that dict, and a list of columns that need word embeddings.

    Raises:
        IndexError: (as ``_ModelReplyFormatError``, also a ``ValueError``) if
            the reply lacks one of the section markers.
        ValueError, SyntaxError: If a section is not a valid Python literal.

    NOTE(review): the returned expression strings are model-generated; if a
    caller evaluates them they should be treated as untrusted input.
    """
    path = "final/" + query + ".csv"
    df = pd.read_csv(path)
    prompt = _build_prompt(extract_column_samples(df))

    # The API key is read from the "gemini_api" environment variable.
    genai.configure(api_key=os.getenv("gemini_api"))
    model = genai.GenerativeModel("gemini-2.0-flash")
    response = model.generate_content(prompt)

    merge_map_text = response.text.strip()
    print(merge_map_text)

    return _parse_model_reply(merge_map_text)

# getCodes(extract_column_samples)