File size: 7,039 Bytes
e365a68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c9df750
e365a68
 
 
 
 
 
 
 
 
 
 
 
68aebec
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import os
import gradio as gr
import threading
from src.synth_data_gen import SynthDataGen

# Shared generator instance: update_pipeline() calls its generate_dataset()
# method and build_ui() returns its output_dir alongside the UI.
generator = SynthDataGen()

def update_output_format(dataset_type):
    """Return a dropdown update with the output formats valid for a dataset type.

    "Tabular" and "Time-series" offer JSON, csv and Parquet; "Text" offers
    JSON and Markdown. In both cases the selection is reset to "JSON".
    Any other dataset type yields ``None``.
    """
    # Each entry pairs the dataset types with the formats they support.
    format_table = (
        (("Tabular", "Time-series"), ["JSON", "csv", "Parquet"]),
        (("Text",), ["JSON", "Markdown"]),
    )
    for kinds, formats in format_table:
        if dataset_type in kinds:
            return gr.update(choices=formats, value="JSON")
    return None

# Seconds a generated file stays on disk before it is deleted automatically.
_AUTO_DELETE_DELAY_SECONDS = 60


def _remove_file_quietly(path):
    """Delete *path*, tolerating a file that has already been removed."""
    try:
        os.remove(path)
    except FileNotFoundError:
        # Already gone; the cleanup goal is met, so there is nothing to report.
        pass
    except OSError as exc:
        # Runs in a timer thread: report instead of raising an unhandled error.
        print(f"Could not remove generated file {path}: {exc}")


def update_pipeline(business_problem, dataset_type, output_format, num_samples, model):
    """Generate a dataset and stream UI updates for the download flow.

    This is a generator: every yielded value is a three-item list matching
    the outputs wired in ``build_ui`` -- an update for the file download
    component, an update for the run button, and a status message string.

    Args:
        business_problem: Free-text description of the desired dataset.
            Empty, whitespace-only or ``None`` values are rejected with a
            status message and nothing is generated.
        dataset_type: Selected dataset type, forwarded to the generator.
        output_format: Selected output format, forwarded to the generator.
        num_samples: Requested number of samples, forwarded to the generator.
        model: Selected model name, forwarded to the generator.

    Yields:
        ``[file_update, button_update, status_text]`` lists. Exceptions raised
        while generating are caught and reported through the status text.
    """
    # Reject missing input up front; the None check avoids an AttributeError
    # from calling .strip() on a non-string value.
    if not business_problem or not business_problem.strip():
        yield [gr.update(visible=False), gr.update(visible=True), "❌ Please enter a business problem before generating."]
        return

    # Initial feedback while generating: hide both the download box and the button.
    yield [gr.update(visible=False), gr.update(visible=False), "⏳ Generating dataset..."]

    try:
        # Generate dataset file
        file_path = generator.generate_dataset(
            business_problem=business_problem,
            dataset_type=dataset_type,
            output_format=output_format,
            num_samples=num_samples,
            model=model,
        )
        print("🧪 File result returned:", file_path)

        # Check if file exists and return success message + file path
        if isinstance(file_path, str) and os.path.exists(file_path):
            # Schedule automatic deletion; the helper tolerates the file
            # having been removed before the timer fires.
            threading.Timer(
                _AUTO_DELETE_DELAY_SECONDS, _remove_file_quietly, args=[file_path]
            ).start()
            yield [gr.update(value=file_path, visible=True), gr.update(visible=True), "✅ Dataset ready for download."]
        else:
            # Handle invalid or missing file
            yield [gr.update(visible=False), gr.update(visible=True), "❌ Error: File not created or path invalid."]

    except Exception as e:
        # UI boundary: surface any pipeline error to the user and restore the button.
        yield [gr.update(visible=False), gr.update(visible=True), f"❌ Pipeline error: {e}"]

def build_ui(css_path="assets/styles.css"):
    """Build the SynthDataGen Gradio interface.

    Args:
        css_path: Path to the stylesheet applied to the whole interface.

    Returns:
        A tuple ``(ui, output_dir)`` where ``ui`` is the assembled
        ``gr.Blocks`` app and ``output_dir`` is ``generator.output_dir``.

    Raises:
        OSError: If the stylesheet at ``css_path`` cannot be opened.
    """
    # Read the stylesheet as UTF-8 explicitly so decoding does not depend on
    # the platform's default locale encoding.
    with open(css_path, "r", encoding="utf-8") as f:
        css = f.read()

    with gr.Blocks(css=css, title="🧬SynthDataGen") as ui:
        with gr.Column(elem_id="app-container"):
            # Header and introductory content
            gr.Markdown("<h1 id='app-title'>SynthDataGen 🧬 </h1>")
            gr.Markdown("<h2 id='app-subtitle'>AI-Powered Synthetic Dataset Generator</h2>")

            gr.HTML("""
            <div id="intro-text">
                <p>With SynthDataGen, easily generate <strong>diverse datasets in different formats</strong> for testing, development, and AI training.</p>
                <h4>🎯 How It Works:</h4>
                <ol>
                <li>1️⃣ Define your business problem or dataset topic.</li>
                <li>2️⃣ Select the dataset type, output format, model, and number of samples.</li>
                <li>3️⃣ Receive your synthetic dataset — ready to download and use!</li>
                </ol>
            </div>
            """)

            gr.HTML("""
                <div id="learn-more-button">
                    <a href="https://github.com/lisek75/synthdatagen_app/blob/main/README.md" class="button-link" target="_blank">Learn More</a>
                </div>
                """)

            gr.Markdown("""
                <p><strong>🧠 Need inspiration?</strong> Try one of these examples:</p>
                <ul>
                <li>Movie summaries for genre classification.</li>
                <li>Generate customer chats with realistic dialogue, chat_id, timestamp, names, sentiment label, and aligned transcript.</li>
                <li>Create daily stock prices for 2 companies with typical fields like date, ticker, open, close, high, low, and volume.</li>
                </ul>
                """)

            gr.Markdown("<p><strong>Start generating your synthetic datasets now!</strong> 🗂️✨</p>")

            # User inputs
            # NOTE(review): the three dropdowns below share
            # elem_id="custom-dropdown"; HTML ids are normally unique. Kept
            # as-is because the stylesheet presumably targets this id -- confirm.
            with gr.Group(elem_id="input-container"):

                business_problem = gr.Textbox(
                    placeholder="Describe the dataset you want (e.g., Job postings, Customer reviews, Sensor data, Movie titles)",
                    lines=2,
                    label="📌 Business Problem",
                    elem_classes=["label-box"],
                    elem_id="business-problem-box"
                )

                with gr.Row(elem_classes="column-gap"):
                    with gr.Column(scale=1):
                        dataset_type = gr.Dropdown(
                            ["Tabular", "Time-series", "Text"],
                            value="Tabular",
                            label="📊 Dataset Type",
                            elem_classes=["label-box"],
                            elem_id="custom-dropdown"
                        )

                    with gr.Column(scale=1):
                        # Initial choices match the default "Tabular" dataset type.
                        output_format = gr.Dropdown(
                            choices=["JSON", "csv", "Parquet"],
                            value="JSON",
                            label="📁 Output Format",
                            elem_classes=["label-box"],
                            elem_id="custom-dropdown"
                        )

                    # Bind the update function to the dataset type dropdown
                    dataset_type.change(
                        update_output_format,
                        inputs=[dataset_type],
                        outputs=[output_format]
                    )

                with gr.Row(elem_classes="row-spacer column-gap"):
                    with gr.Column(scale=1):
                        model = gr.Dropdown(
                            ["GPT", "Claude"],
                            value="GPT",
                            label="🤖 Model",
                            elem_classes=["label-box"],
                            elem_id="custom-dropdown"
                        )

                    with gr.Column(scale=1):
                        num_samples = gr.Slider(
                            minimum=10,
                            maximum=1000,
                            value=10,
                            step=1,
                            interactive=True,
                            label="🔢 Number of Samples",
                            elem_classes=["label-box"]
                        )

            # Hidden file component for dataset download
            file_download = gr.File(visible=False, elem_id="download-box", label=None)

            # Component to display status messages
            status_message = gr.Markdown("", label="Status")

            # Button to trigger dataset generation. The outputs order must match
            # the three-item lists yielded by update_pipeline.
            run_btn = gr.Button("Create a dataset", elem_id="run-btn")
            run_btn.click(
                update_pipeline,
                inputs=[business_problem, dataset_type, output_format, num_samples, model],
                outputs=[file_download, run_btn, status_message]
            )

    return ui, generator.output_dir