import os
from io import StringIO

import gradio as gr
import pandas as pd
from huggingface_hub import HfFolder
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GPT2LMHeadModel,
    GPT2Tokenizer,
)

# Access the Hugging Face API token from environment variables
hf_token = os.getenv("HF_API_TOKEN")
if not hf_token:
    raise ValueError(
        "Hugging Face API token is not set. Please set the HF_API_TOKEN environment variable."
    )
HfFolder.save_token(hf_token)

# Load the GPT-2 tokenizer and model (used to expand the user's description)
tokenizer_gpt2 = GPT2Tokenizer.from_pretrained("gpt2")
model_gpt2 = GPT2LMHeadModel.from_pretrained("gpt2")

# Load the Llama 3.1 tokenizer and model in sharded mode
model_name = "meta-llama/Meta-Llama-3.1-8B"
try:
    tokenizer_llama = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model_llama = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",  # use device_map for automatic sharding
        token=hf_token,
    )
except OSError as e:
    print(f"Error loading model: {e}")
    raise  # the app cannot run without the generator model

# Define the prompt template
prompt_template = """\
You are an expert in generating synthetic data for machine learning models.
Your task is to generate a synthetic tabular dataset based on the description provided below.
Description: {description}
The dataset should include the following columns: {columns}
Please provide the data in CSV format with a minimum of 100 rows per generation.
Ensure that the data is realistic, does not contain any duplicate rows, and follows any specific conditions mentioned.

Example Description:
Generate a dataset for predicting house prices with columns: 'Size', 'Location', 'Number of Bedrooms', 'Price'

Example Output:
Size,Location,Number of Bedrooms,Price
1200,Suburban,3,250000
900,Urban,2,200000
1500,Rural,4,300000
...

Description:
{description}
Columns:
{columns}
Output: """


def preprocess_user_prompt(user_prompt):
    """Expand the raw description with GPT-2 before it is inserted into the template."""
    input_ids = tokenizer_gpt2.encode(user_prompt, return_tensors="pt")
    generated_ids = model_gpt2.generate(
        input_ids,
        max_length=60,
        pad_token_id=tokenizer_gpt2.eos_token_id,  # GPT-2 has no pad token by default
    )[0]
    return tokenizer_gpt2.decode(generated_ids, skip_special_tokens=True)


def format_prompt(description, columns):
    processed_description = preprocess_user_prompt(description)
    return prompt_template.format(
        description=processed_description, columns=",".join(columns)
    )


def generate_synthetic_data(description, columns):
    try:
        formatted_prompt = format_prompt(description, columns)
        inputs = tokenizer_llama(formatted_prompt, return_tensors="pt").to(
            model_llama.device
        )
        generated_output = model_llama.generate(**inputs, max_new_tokens=512)
        generated_text = tokenizer_llama.decode(
            generated_output[0], skip_special_tokens=True
        )
        # The decoded text echoes the prompt, so keep only what follows the
        # final "Output:" marker.
        return generated_text.split("Output:")[-1].strip()
    except Exception as e:
        print(f"Error in generate_synthetic_data: {e}")
        return f"Error: {e}"


def generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_generation=100):
    """Build up num_rows rows by concatenating several smaller generations."""
    data_frames = []
    num_iterations = num_rows // rows_per_generation
    for _ in range(num_iterations):
        generated_data = generate_synthetic_data(description, columns)
        if generated_data.startswith("Error"):
            return generated_data
        data_frames.append(process_generated_data(generated_data))
    # Drop duplicates across batches, since the prompt requires unique rows.
    return pd.concat(data_frames, ignore_index=True).drop_duplicates()


def process_generated_data(csv_data):
    return pd.read_csv(StringIO(csv_data))


def main(description, columns):
    description = description.strip()
    columns = [col.strip() for col in columns.split(",")]
    df_synthetic = generate_large_synthetic_data(description, columns)
    if isinstance(df_synthetic, str):
        return df_synthetic  # return the error message if any
    return df_synthetic.to_csv(index=False)


iface = gr.Interface(
    fn=main,
    inputs=[
        gr.Textbox(
            label="Description",
            placeholder="e.g., Generate a dataset for predicting students' grades",
        ),
        gr.Textbox(
            label="Columns (comma-separated)",
            placeholder="e.g., name, age, course, grade",
        ),
    ],
    outputs="text",
    title="Synthetic Data Generator",
    description="Generate synthetic tabular datasets based on a description and specified columns.",
    api_name="generate",  # must be set at construction; assigning iface.api_name afterwards has no effect
)

# Run the Gradio app
iface.launch(server_name="0.0.0.0", server_port=7860)
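
# --- Example client usage (a minimal sketch, not part of the app itself) ---
# Because api_name="generate" is set above, the running app exposes a named
# endpoint that the official `gradio_client` package can call. The host/port
# below simply mirror the launch() arguments; adjust them for your deployment.
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860/")
#   csv_text = client.predict(
#       "Generate a dataset for predicting students' grades",  # description
#       "name, age, course, grade",                            # columns
#       api_name="/generate",
#   )
#   print(csv_text)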