yakine committed on
Commit
a93106f
·
verified ·
1 Parent(s): bad46c5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -19
app.py CHANGED
@@ -1,10 +1,10 @@
1
  from fastapi import FastAPI
2
- from fastapi.responses import JSONResponse
3
  from pydantic import BaseModel
4
  import pandas as pd
5
  import os
6
  import requests
7
- from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer, AutoModelForCausalLM, pipeline
8
  from io import StringIO
9
  from fastapi.middleware.cors import CORSMiddleware
10
  from huggingface_hub import HfFolder
@@ -41,25 +41,18 @@ def preprocess_user_prompt(user_prompt):
41
  # Define prompt template
42
  prompt_template = """\
43
  You are an expert in generating synthetic data for machine learning models.
44
-
45
  Your task is to generate a synthetic tabular dataset based on the description provided below.
46
-
47
  Description: {description}
48
-
49
  The dataset should include the following columns: {columns}
50
-
51
  Please provide the data in CSV format.
52
-
53
  Example Description:
54
  Generate a dataset for predicting house prices with columns: 'Size', 'Location', 'Number of Bedrooms', 'Price'
55
-
56
  Example Output:
57
  Size,Location,Number of Bedrooms,Price
58
  1200,Suburban,3,250000
59
  900,Urban,2,200000
60
  1500,Rural,4,300000
61
  ...
62
-
63
  Description:
64
  {description}
65
  Columns:
@@ -120,7 +113,7 @@ def process_generated_data(csv_data, expected_columns):
120
  return f"Failed to parse CSV data: {e}"
121
 
122
  def generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_generation=100):
123
- data_frames = []
124
 
125
  for _ in tqdm(range(num_rows // rows_per_generation), desc="Generating Data"):
126
  generated_data = generate_synthetic_data(description, columns)
@@ -129,12 +122,12 @@ def generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_
129
 
130
  df_synthetic = process_generated_data(generated_data, columns)
131
  if isinstance(df_synthetic, pd.DataFrame) and not df_synthetic.empty:
132
- data_frames.append(df_synthetic)
133
  else:
134
  print("Skipping invalid generation.")
135
 
136
- if data_frames:
137
- return pd.concat(data_frames, ignore_index=True)
138
  else:
139
  return "No valid data frames to concatenate."
140
 
@@ -147,12 +140,13 @@ def generate_data(request: DataGenerationRequest):
147
  if isinstance(generated_data, str) and "Error" in generated_data:
148
  return JSONResponse(content={"error": generated_data}, status_code=500)
149
 
150
- # Process the generated CSV data into a DataFrame
151
- df_synthetic = process_generated_data(generated_data, columns)
152
- if isinstance(df_synthetic, pd.DataFrame):
153
- return JSONResponse(content={"data": df_synthetic.to_dict(orient="records")})
154
- else:
155
- return JSONResponse(content={"error": "Failed to generate valid synthetic data"}, status_code=500)
 
156
 
157
  @app.get("/")
158
  def greet_json():
 
1
  from fastapi import FastAPI
2
+ from fastapi.responses import StreamingResponse, JSONResponse
3
  from pydantic import BaseModel
4
  import pandas as pd
5
  import os
6
  import requests
7
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer
8
  from io import StringIO
9
  from fastapi.middleware.cors import CORSMiddleware
10
  from huggingface_hub import HfFolder
 
41
  # Define prompt template
42
  prompt_template = """\
43
  You are an expert in generating synthetic data for machine learning models.
 
44
  Your task is to generate a synthetic tabular dataset based on the description provided below.
 
45
  Description: {description}
 
46
  The dataset should include the following columns: {columns}
 
47
  Please provide the data in CSV format.
 
48
  Example Description:
49
  Generate a dataset for predicting house prices with columns: 'Size', 'Location', 'Number of Bedrooms', 'Price'
 
50
  Example Output:
51
  Size,Location,Number of Bedrooms,Price
52
  1200,Suburban,3,250000
53
  900,Urban,2,200000
54
  1500,Rural,4,300000
55
  ...
 
56
  Description:
57
  {description}
58
  Columns:
 
113
  return f"Failed to parse CSV data: {e}"
114
 
115
  def generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_generation=100):
116
+ csv_data_all = ""
117
 
118
  for _ in tqdm(range(num_rows // rows_per_generation), desc="Generating Data"):
119
  generated_data = generate_synthetic_data(description, columns)
 
122
 
123
  df_synthetic = process_generated_data(generated_data, columns)
124
  if isinstance(df_synthetic, pd.DataFrame) and not df_synthetic.empty:
125
+ csv_data_all += df_synthetic.to_csv(index=False, header=False)
126
  else:
127
  print("Skipping invalid generation.")
128
 
129
+ if csv_data_all:
130
+ return csv_data_all
131
  else:
132
  return "No valid data frames to concatenate."
133
 
 
140
  if isinstance(generated_data, str) and "Error" in generated_data:
141
  return JSONResponse(content={"error": generated_data}, status_code=500)
142
 
143
+ # Create a streaming response to return the CSV data
144
+ csv_buffer = StringIO(generated_data)
145
+ return StreamingResponse(
146
+ csv_buffer,
147
+ media_type="text/csv",
148
+ headers={"Content-Disposition": "attachment; filename=generated_data.csv"}
149
+ )
150
 
151
  @app.get("/")
152
  def greet_json():