Spaces:

vansh9878
/

Dataset

Runtime error

App Files Files Community

Dataset / preprocessing /getString.py

vansh9878

files added

825e978 9 days ago

raw

history blame contribute delete

3.17 kB

	import os
	import pandas as pd
	import ast
	import google.generativeai as genai
	from dotenv import load_dotenv

	# Load key=value pairs from a local .env file into the process environment at
	# import time, so the os.getenv("gemini_api") lookup in getCodes can find the key.
	load_dotenv()

	def extract_column_samples(df, n=5):
	    """Collect the first ``n`` values of every column in ``df``.

	    Args:
	        df: DataFrame to sample from.
	        n: Number of leading rows to take from each column (forwarded to
	            ``Series.head``). Defaults to 5.

	    Returns:
	        A dict mapping each column label to a list holding that column's
	        first ``n`` values, in column order. Empty when ``df`` has no
	        columns.
	    """
	    # ``tolist`` yields plain lists, which render cleanly when the dict is
	    # later interpolated into the text prompt.
	    return {col: df[col].head(n).tolist() for col in df.columns}

	def _build_prompt(samples):
	    """Return the instruction prompt for the model, ending with ``samples``."""
	    return (
	        "You are a data analyst. I will give you a dictionary containing column names with example values from a dataset.\n\n"
	        "Your task is to:\n"
	        "1. Identify columns where one-hot encoding is not suitable.\n"
	        "2. For each of these, determine if it requires:\n"
	        " - feature extraction (e.g., from datetime or strings), or\n"
	        " - use of word embeddings (e.g., for free text or high-cardinality text).\n\n"
	        "For feature extraction columns:\n"
	        "- Create a Python dictionary where:\n"
	        " * Each key is a new, meaningful column name.\n"
	        " * Each value is a valid Pandas expression string that derives the new column from the original `df` DataFrame.\n"
	        "- Also return a Python list of original column names that were used in this dictionary.\n\n"
	        "For columns requiring word embeddings:\n"
	        "- Return a separate Python list of these column names.\n"
	        "- If any column appears in both cases, include it only in the word embedding list.\n\n"
	        "Your output must follow this exact format with no additional explanation or markdown. Only return the following inside a single Python code block:\n"
	        "```python\n"
	        "# Dictionary of transformations\n"
	        "{'new_col1': \"some pandas expression\", 'new_col2': \"some other pandas expression\"}\n\n"
	        "# Array of columns used in the dictionary\n"
	        "['col1', 'col2']\n"
	        "# Array of columns that require the use of word embeddings\n"
	        "['col3', 'col4']\n"
	        "```\n\n"
	        "DO NOT include any explanation, reasoning, extra code, or markdown outside of the code block. Only return the exact format shown above. Do not generate or describe functions.\n\n"
	        f"Here is the input :\n{samples}\n"
	    )


	def _extract_code_block(reply):
	    """Return the body of the first fenced code block in ``reply``.

	    The whole reply is returned unchanged when it contains no fence, so an
	    unfenced but otherwise well-formed answer can still be parsed.
	    """
	    _, fence, after_fence = reply.partition("```")
	    if not fence:
	        return reply
	    block = after_fence.partition("```")[0]
	    first_line, newline, remainder = block.partition("\n")
	    # Whatever follows the opening fence on the same line is a language tag
	    # such as "python" (or nothing at all); it is not part of the payload.
	    tag = first_line.strip()
	    if newline and (not tag or tag.isalnum()):
	        return remainder
	    return block


	def _parse_reply(reply):
	    """Parse the model reply into ``(transformations, used_columns, nlp_columns)``.

	    The reply is expected to hold three Python literals in order: a dict and
	    two lists. They are located by parsing the code block as Python rather
	    than by searching for the exact comment markers requested in the prompt,
	    so reworded or missing comments do not break parsing.

	    Raises:
	        SyntaxError: If the code block is not syntactically valid Python.
	        ValueError: If a statement is not a plain literal, or the reply does
	            not contain exactly one dict followed by two lists.
	    """
	    code = _extract_code_block(reply)
	    # Only bare literals are expected, so per-line leading whitespace carries
	    # no meaning; dropping it keeps an indented reply from tripping the parser.
	    code = "\n".join(line.lstrip() for line in code.splitlines())
	    tree = ast.parse(code)
	    # Accept both bare expressions (the requested format) and simple
	    # assignments such as ``mapping = {...}``; other statements are ignored.
	    literals = [
	        ast.literal_eval(node.value)
	        for node in tree.body
	        if isinstance(node, (ast.Expr, ast.Assign))
	    ]
	    if len(literals) != 3:
	        raise ValueError(
	            f"expected 3 literals (dict, list, list) in model reply, found {len(literals)}"
	        )
	    transformations, used_columns, nlp_columns = literals
	    if not (
	        isinstance(transformations, dict)
	        and isinstance(used_columns, list)
	        and isinstance(nlp_columns, list)
	    ):
	        raise ValueError(
	            "model reply must contain a dict followed by two lists, got "
	            f"{type(transformations).__name__}, {type(used_columns).__name__}, "
	            f"{type(nlp_columns).__name__}"
	        )
	    return transformations, used_columns, nlp_columns


	def getCodes(query, data_dir="final", model_name="gemini-2.0-flash"):
	    """Ask a Gemini model how to preprocess the columns of ``<data_dir>/<query>.csv``.

	    Reads the CSV, samples the first rows of every column with
	    ``extract_column_samples``, sends those samples to the model inside a
	    fixed instruction prompt, prints the raw reply, and parses it.

	    Args:
	        query: Base name (without ``.csv``) of the dataset file to analyse.
	        data_dir: Directory containing the CSV. Defaults to ``"final"``.
	        model_name: Name passed to ``genai.GenerativeModel``. Defaults to
	            ``"gemini-2.0-flash"``.

	    Returns:
	        A 3-tuple ``(preprocessing_code, actual_list, nlp)``:
	        a dict mapping new column names to pandas expression strings, a list
	        of the original columns used by that dict, and a list of columns the
	        model flagged for word embeddings.

	    Raises:
	        SyntaxError: If the model's code block is not valid Python.
	        ValueError: If the reply does not hold exactly a dict and two lists
	            of plain literals.

	    Errors from ``pd.read_csv`` (e.g. a missing file) and from the
	    ``google.generativeai`` client propagate unchanged.
	    """
	    # NOTE(review): ``query`` goes into the path unchecked; if it can come from
	    # an end user, reject path separators / ".." before this point.
	    csv_path = os.path.join(data_dir, f"{query}.csv")
	    df = pd.read_csv(csv_path)
	    prompt = _build_prompt(extract_column_samples(df))

	    # The API key is read from the "gemini_api" environment variable.
	    genai.configure(api_key=os.getenv("gemini_api"))
	    model = genai.GenerativeModel(model_name)
	    reply = model.generate_content(prompt).text.strip()
	    print(reply)

	    # NOTE(review): the returned expression strings are model-generated; if a
	    # caller evaluates them, they must be treated as untrusted code.
	    return _parse_reply(reply)

	# Example usage: getCodes("covid 19")