# pip install --upgrade transformers
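# Streamlit app that generates synthetic tabular data: GPT-2 expands the user's
# description, then Meta-Llama-3.1-8B produces CSV rows from a prompt template.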
import streamlit as st
import pandas as pd
import os
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer, AutoModelForCausalLM, pipeline
from huggingface_hub import HfFolder
from io import StringIO
# Access the Hugging Face API token from environment variables
hf_token = os.getenv('HF_API_TOKEN')
if not hf_token:
    raise ValueError("Hugging Face API token is not set. Please set the HF_API_TOKEN environment variable.")
HfFolder.save_token(hf_token)
# Disable TensorFlow oneDNN optimizations to avoid minor floating-point discrepancies
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
# Load the GPT-2 tokenizer and LM-head model (GPT2Model has no language-modeling head, so it cannot generate text)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')
# Create a pipeline for text generation using GPT-2
text_generator = pipeline("text-generation", model=model_gpt2, tokenizer=tokenizer)
# Lazy loading function for Llama-3 model
model_llama = None
tokenizer_llama = None
def load_llama_model():
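    """Load the Llama model and tokenizer once, on first use, so app startup stays light."""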
    global model_llama, tokenizer_llama
    if model_llama is None:
        model_name = "meta-llama/Meta-Llama-3.1-8B"
        model_llama = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,  # Use FP16 for reduced memory
            token=hf_token
        )
        tokenizer_llama = AutoTokenizer.from_pretrained(model_name, token=hf_token)
# Define your prompt template
prompt_template = """\
You are an expert in generating synthetic data for machine learning models.
Your task is to generate a synthetic tabular dataset based on the description provided below.
Description: {description}
The dataset should include the following columns: {columns}
Please provide the data in CSV format with a minimum of 100 rows per generation.
Ensure that the data is realistic, does not contain any duplicate rows, and follows any specific conditions mentioned.
Example Description:
Generate a dataset for predicting house prices with columns: 'Size', 'Location', 'Number of Bedrooms', 'Price'
Example Output:
Size,Location,Number of Bedrooms,Price
1200,Suburban,3,250000
900,Urban,2,200000
1500,Rural,4,300000
...
Description:
{description}
Columns:
{columns}
Output: """
def preprocess_user_prompt(user_prompt):
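    """Expand the user's short description with GPT-2 before it is inserted into the prompt template."""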
    generated_text = text_generator(user_prompt, max_length=60, num_return_sequences=1)[0]["generated_text"]
    return generated_text
def format_prompt(description, columns):
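    """Fill the prompt template with the expanded description and the requested column names."""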
    processed_description = preprocess_user_prompt(description)
    prompt = prompt_template.format(description=processed_description, columns=",".join(columns))
    return prompt
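# Decoding parameters for synthetic-data generation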
generation_params = {
"top_p": 0.90,
"temperature": 0.8,
"max_new_tokens": 512,
"return_full_text": False,
"use_cache": False
}
def generate_synthetic_data(description, columns):
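    """Generate one CSV chunk with the Llama model; returns the raw text, or an 'Error: ...' string on failure."""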
    try:
        # Load the Llama model only when generating data
        load_llama_model()
        # Prepare the input for the Llama model
        formatted_prompt = format_prompt(description, columns)
        # Tokenize the prompt
        inputs = tokenizer_llama(formatted_prompt, return_tensors="pt").to(model_llama.device)
        # Generate synthetic data (do_sample=True so top_p/temperature take effect)
        with torch.no_grad():
            outputs = model_llama.generate(
                **inputs,
                max_new_tokens=generation_params["max_new_tokens"],
                do_sample=True,
                top_p=generation_params["top_p"],
                temperature=generation_params["temperature"],
                num_return_sequences=1
            )
        # Decode only the newly generated tokens, skipping the echoed prompt
        generated_text = tokenizer_llama.decode(
            outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True
        )
        # Return the generated synthetic data
        return generated_text
    except Exception as e:
        print(f"Error in generate_synthetic_data: {e}")
        return f"Error: {e}"
def generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_generation=100):
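    """Call generate_synthetic_data repeatedly and concatenate the chunks into one DataFrame."""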
    data_frames = []
    num_iterations = num_rows // rows_per_generation
    for _ in range(num_iterations):
        generated_data = generate_synthetic_data(description, columns)
        if "Error" in generated_data:
            return generated_data
        df_synthetic = process_generated_data(generated_data)
        data_frames.append(df_synthetic)
    return pd.concat(data_frames, ignore_index=True)
def process_generated_data(csv_data):
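    """Parse the model's CSV output into a pandas DataFrame."""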
    data = StringIO(csv_data)
    df = pd.read_csv(data)
    return df
# Streamlit app interface
st.title("Synthetic Data Generator")
description = st.text_input("Description", placeholder="e.g., Generate a dataset for predicting students' grades")
columns = st.text_input("Columns (comma-separated)", placeholder="e.g., name, age, course, grade")
if st.button("Generate"):
    description = description.strip()
    columns = [col.strip() for col in columns.split(',')]
    df_synthetic = generate_large_synthetic_data(description, columns)
    if isinstance(df_synthetic, str) and "Error" in df_synthetic:
        st.error(df_synthetic)  # Display error message if any
    else:
        st.success("Synthetic Data Generated!")
        st.dataframe(df_synthetic)  # Display the generated DataFrame
        st.download_button(
            label="Download CSV",
            data=df_synthetic.to_csv(index=False),
            file_name="synthetic_data.csv",
            mime="text/csv"
        )