Spaces:
Running
Running
File size: 7,039 Bytes
e365a68 c9df750 e365a68 68aebec |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
import os
import gradio as gr
import threading
from src.synth_data_gen import SynthDataGen
# Single module-level generator instance: update_pipeline() calls its
# generate_dataset() method and build_ui() returns its output_dir.
generator = SynthDataGen()
def update_output_format(dataset_type):
    """Return a dropdown update listing the output formats for *dataset_type*.

    "Tabular" and "Time-series" offer JSON, csv and Parquet; "Text" offers
    JSON and Markdown. In both cases the selection is reset to "JSON".
    Any other dataset type yields ``None``.
    """
    if dataset_type == "Text":
        formats = ["JSON", "Markdown"]
    elif dataset_type in ("Tabular", "Time-series"):
        formats = ["JSON", "csv", "Parquet"]
    else:
        # Unknown dataset type: no update is produced.
        return None
    return gr.update(choices=formats, value="JSON")
def _remove_file_quietly(path):
    """Delete *path*, tolerating the case where it is already gone."""
    try:
        os.remove(path)
    except FileNotFoundError:
        # Something else removed the file first; nothing left to clean up.
        pass


def update_pipeline(business_problem, dataset_type, output_format, num_samples, model):
    """Generate a dataset and stream UI updates for the run button's click event.

    This is a generator function. Every yielded value is a three-item list
    that lines up with the ``outputs`` wired to the click handler in
    ``build_ui``: the download file component, the run button, and the
    status message.

    Args:
        business_problem: Free-text description entered by the user. An
            empty, whitespace-only or missing (``None``) value aborts the
            run with a validation message.
        dataset_type: Selected dataset type, forwarded to the generator.
        output_format: Selected output format, forwarded to the generator.
        num_samples: Requested number of samples, forwarded to the generator.
        model: Selected model name, forwarded to the generator.

    Yields:
        ``[file_update, button_update, status_text]`` lists: first a
        "generating" state with the button hidden, then either the file
        ready for download or an error message with the button restored.
    """
    # Guard against None as well as blank text so validation never raises.
    if not business_problem or not business_problem.strip():
        yield [gr.update(visible=False), gr.update(visible=True), "❌ Please enter a business problem before generating."]
        return

    # Initial feedback: hide both the download box and the button while working.
    yield [gr.update(visible=False), gr.update(visible=False), "⏳ Generating dataset..."]

    try:
        file_path = generator.generate_dataset(
            business_problem=business_problem,
            dataset_type=dataset_type,
            output_format=output_format,
            num_samples=num_samples,
            model=model,
        )
        print("🧪 File result returned:", file_path)

        if isinstance(file_path, str) and os.path.exists(file_path):
            # Auto-delete after 60s; the helper keeps the timer thread from
            # raising if the file has already been removed by then.
            threading.Timer(60, _remove_file_quietly, args=[file_path]).start()
            yield [gr.update(value=file_path, visible=True), gr.update(visible=True), "✅ Dataset ready for download."]
        else:
            # The generator returned something that is not an existing file path.
            yield [gr.update(visible=False), gr.update(visible=True), "❌ Error: File not created or path invalid."]
    except Exception as e:
        # UI boundary: surface any pipeline failure as a status message.
        yield [gr.update(visible=False), gr.update(visible=True), f"❌ Pipeline error: {e}"]
def build_ui(css_path="assets/styles.css"):
    """Build the SynthDataGen Gradio interface.

    Args:
        css_path: Path of the stylesheet whose contents are passed to
            ``gr.Blocks`` as custom CSS.

    Returns:
        A ``(ui, output_dir)`` tuple: the assembled ``gr.Blocks`` app and the
        module-level generator's ``output_dir`` attribute.
    """
    # Read the stylesheet as UTF-8 explicitly so decoding does not depend on
    # the platform's default encoding.
    with open(css_path, "r", encoding="utf-8") as f:
        css = f.read()

    # Multi-line literals below are kept flush-left so that no source
    # indentation leaks into the rendered markup.
    with gr.Blocks(css=css, title="🧬SynthDataGen") as ui:
        with gr.Column(elem_id="app-container"):
            gr.Markdown("<h1 id='app-title'>SynthDataGen 🧬 </h1>")
            gr.Markdown("<h2 id='app-subtitle'>AI-Powered Synthetic Dataset Generator</h2>")

            gr.HTML("""
<div id="intro-text">
<p>With SynthDataGen, easily generate <strong>diverse datasets in different formats</strong> for testing, development, and AI training.</p>
<h4>🎯 How It Works:</h4>
<ol>
<li>1️⃣ Define your business problem or dataset topic.</li>
<li>2️⃣ Select the dataset type, output format, model, and number of samples.</li>
<li>3️⃣ Receive your synthetic dataset — ready to download and use!</li>
</ol>
</div>
""")

            gr.HTML("""
<div id="learn-more-button">
<a href="https://github.com/lisek75/synthdatagen_app/blob/main/README.md" class="button-link" target="_blank">Learn More</a>
</div>
""")

            gr.Markdown("""
<p><strong>🧠 Need inspiration?</strong> Try one of these examples:</p>
<ul>
<li>Movie summaries for genre classification.</li>
<li>Generate customer chats with realistic dialogue, chat_id, timestamp, names, sentiment label, and aligned transcript.</li>
<li>Create daily stock prices for 2 companies with typical fields like date, ticker, open, close, high, low, and volume.</li>
</ul>
""")

            gr.Markdown("<p><strong>Start generating your synthetic datasets now!</strong> 🗂️✨</p>")

            with gr.Group(elem_id="input-container"):
                business_problem = gr.Textbox(
                    placeholder="Describe the dataset you want (e.g., Job postings, Customer reviews, Sensor data, Movie titles)",
                    lines=2,
                    label="📌 Business Problem",
                    elem_classes=["label-box"],
                    elem_id="business-problem-box"
                )

                with gr.Row(elem_classes="column-gap"):
                    with gr.Column(scale=1):
                        dataset_type = gr.Dropdown(
                            ["Tabular", "Time-series", "Text"],
                            value="Tabular",
                            label="📊 Dataset Type",
                            elem_classes=["label-box"],
                            elem_id="custom-dropdown"
                        )
                    with gr.Column(scale=1):
                        output_format = gr.Dropdown(
                            choices=["JSON", "csv", "Parquet"],
                            value="JSON",
                            label="📁 Output Format",
                            elem_classes=["label-box"],
                            elem_id="custom-dropdown"
                        )

                # Refresh the available output formats whenever the dataset
                # type changes.
                dataset_type.change(
                    update_output_format,
                    inputs=[dataset_type],
                    outputs=[output_format]
                )

                with gr.Row(elem_classes="row-spacer column-gap"):
                    with gr.Column(scale=1):
                        model = gr.Dropdown(
                            ["GPT", "Claude"],
                            value="GPT",
                            label="🤖 Model",
                            elem_classes=["label-box"],
                            elem_id="custom-dropdown"
                        )
                    with gr.Column(scale=1):
                        num_samples = gr.Slider(
                            minimum=10,
                            maximum=1000,
                            value=10,
                            step=1,
                            interactive=True,
                            label="🔢 Number of Samples",
                            elem_classes=["label-box"]
                        )

            # NOTE(review): the three components below are placed directly
            # under the app container; confirm they are not meant to live
            # inside the input group.

            # Hidden file component for dataset download
            file_download = gr.File(visible=False, elem_id="download-box", label=None)

            # Component to display status messages
            status_message = gr.Markdown("", label="Status")

            # Button to trigger dataset generation; update_pipeline streams
            # updates for the file box, the button itself and the status text.
            run_btn = gr.Button("Create a dataset", elem_id="run-btn")
            run_btn.click(
                update_pipeline,
                inputs=[business_problem, dataset_type, output_format, num_samples, model],
                outputs=[file_download, run_btn, status_message]
            )

    return ui, generator.output_dir
|