import os
from io import StringIO

import gradio as gr
import pandas as pd
import requests
from huggingface_hub import HfFolder, login
from transformers import AutoTokenizer, GPT2LMHeadModel, GPT2Tokenizer, pipeline

# Load the Hugging Face token from an environment variable
token = os.getenv('HUGGINGFACE_TOKEN')

if token is None:
    raise RuntimeError("HUGGINGFACE_TOKEN is not set; export it before launching the app.")
print("HUGGINGFACE_TOKEN loaded successfully.")

# Authenticate with Hugging Face; login() also persists the token,
# so a separate HfFolder.save_token() call is not needed.
login(token=token)


# Disable TensorFlow's oneDNN custom ops, which can introduce small
# floating-point differences (and a noisy startup warning) on some CPUs
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

# Load the GPT-2 tokenizer and model (used below to expand the user's description)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

# Create a pipeline for text generation using GPT-2
text_generator = pipeline("text-generation", model=model_gpt2, tokenizer=tokenizer)

# Load the Llama 3.1 tokenizer (a gated repo, so the login above is required).
# Llama 3.1 ships a fast tokenizer rather than a SentencePiece model, so
# AutoTokenizer is the reliable loader here.
try:
    tokenizer_llama = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B")
except OSError as e:
    print(f"Error loading tokenizer: {e}")

# Define your prompt template
prompt_template = """\
You are an expert in generating synthetic data for machine learning models.

Your task is to generate a synthetic tabular dataset based on the description provided below.

Description: {description}

The dataset should include the following columns: {columns}

Please provide the data in CSV format with a minimum of 100 rows per generation.
Ensure that the data is realistic, does not contain any duplicate rows, and follows any specific conditions mentioned.

Example Description:
Generate a dataset for predicting house prices with columns: 'Size', 'Location', 'Number of Bedrooms', 'Price'

Example Output:
Size,Location,Number of Bedrooms,Price
1200,Suburban,3,250000
900,Urban,2,200000
1500,Rural,4,300000
...

Description:
{description}
Columns:
{columns}
Output: """

def preprocess_user_prompt(user_prompt):
    # Expand the user's short description with GPT-2 so the downstream
    # prompt gives the generation model more context to work with.
    generated_text = text_generator(user_prompt, max_length=60, num_return_sequences=1)[0]["generated_text"]
    return generated_text

def format_prompt(description, columns):
    processed_description = preprocess_user_prompt(description)
    prompt = prompt_template.format(description=processed_description, columns=",".join(columns))
    return prompt

API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-8B"


# Sampling parameters for the Inference API call; return_full_text=False
# keeps the prompt out of the response so only the generated CSV comes back.
generation_params = {
    "top_p": 0.90,
    "temperature": 0.8,
    "max_new_tokens": 512,
    "return_full_text": False,
    "use_cache": False
}

def generate_synthetic_data(description, columns):
    formatted_prompt = format_prompt(description, columns)
    payload = {"inputs": formatted_prompt, "parameters": generation_params}
    headers = {"Authorization": f"Bearer {HfFolder.get_token()}"}
    response = requests.post(API_URL, headers=headers, json=payload)
    
    if response.status_code == 200:
        response_json = response.json()
        if isinstance(response_json, list) and response_json and "generated_text" in response_json[0]:
            return response_json[0]["generated_text"]
        raise ValueError("Unexpected response format or missing 'generated_text' key")
    raise ValueError(f"API request failed with status code {response.status_code}: {response.text}")

def generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_generation=100):
    data_frames = []
    # Round up so a num_rows that is not an exact multiple of
    # rows_per_generation still triggers enough generations.
    num_iterations = -(-num_rows // rows_per_generation)

    for _ in range(num_iterations):
        generated_data = generate_synthetic_data(description, columns)
        df_synthetic = process_generated_data(generated_data)
        data_frames.append(df_synthetic)

    # The prompt asks for no duplicates, but independent generations can still
    # repeat rows, so deduplicate across batches before returning.
    return pd.concat(data_frames, ignore_index=True).drop_duplicates()

def process_generated_data(csv_data):
    # Parse the model's CSV completion into a DataFrame.
    data = StringIO(csv_data)
    df = pd.read_csv(data)
    return df
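
# A more defensive parser, sketched as an assumption (not in the original
# flow): LLM completions sometimes wrap the CSV in prose or end with a
# trailing "..." line, either of which makes pd.read_csv raise. This variant
# keeps only lines containing the delimiter and skips malformed rows.
def process_generated_data_defensive(csv_data, delimiter=","):
    csv_lines = [line for line in csv_data.splitlines() if delimiter in line]
    data = StringIO("\n".join(csv_lines))
    # on_bad_lines="skip" drops rows whose field count does not match the header
    return pd.read_csv(data, on_bad_lines="skip")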

def main(description, columns):
    description = description.strip()
    # The Gradio textbox supplies columns as a single comma-separated string.
    columns = [col.strip() for col in columns.split(',')]
    df_synthetic = generate_large_synthetic_data(description, columns)
    return df_synthetic.to_csv(index=False)

# Gradio interface
iface = gr.Interface(
    fn=main,
    inputs=[
        gr.Textbox(label="Description", placeholder="e.g., Generate a dataset for predicting students' grades"),
        gr.Textbox(label="Columns (comma-separated)", placeholder="e.g., name, age, course, grade")
    ],
    outputs="text",
    title="Synthetic Data Generator",
    description="Generate synthetic tabular datasets based on a description and specified columns."
)
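
# Example interaction (illustrative values only): entering
#   Description: "Generate a dataset for predicting house prices"
#   Columns:     "Size, Location, Number of Bedrooms, Price"
# returns the generated rows as a CSV string in the output textbox.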

# Run the Gradio app
iface.launch()