yakine committed on
Commit
eaf3f9a
·
verified ·
1 Parent(s): c549aa3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -29
app.py CHANGED
@@ -1,15 +1,14 @@
1
  from fastapi import FastAPI, HTTPException
2
  from fastapi.responses import StreamingResponse, JSONResponse
3
  from pydantic import BaseModel
 
4
  import os
5
  import requests
6
  from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer, pipeline
7
  from io import StringIO
8
  from fastapi.middleware.cors import CORSMiddleware
9
  from huggingface_hub import HfFolder
10
- import logging
11
- import random
12
- import csv
13
 
14
  app = FastAPI()
15
 
@@ -89,36 +88,47 @@ def generate_synthetic_data(description, columns):
89
  else:
90
  raise ValueError("Invalid response format from Hugging Face API.")
91
  except (requests.RequestException, ValueError) as e:
92
- logging.error(f"Error during API request or response processing: {e}")
93
- return "name,age,course,grade\nSampleName,20,Course,0"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  def generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_generation=100):
96
- csv_buffer = StringIO()
97
- writer = csv.writer(csv_buffer)
98
-
99
- # Write header
100
- writer.writerow(columns)
101
 
102
- rows_generated = 0
103
- while rows_generated < num_rows:
104
  generated_data = generate_synthetic_data(description, columns)
105
- cleaned_data = generated_data.replace('\r\n', '\n').replace('\r', '\n')
106
- data = StringIO(cleaned_data)
107
-
108
- # Append rows to CSV buffer
109
- reader = csv.reader(data)
110
- header_written = False
111
- for row in reader:
112
- if not header_written:
113
- header_written = True
114
- continue # Skip the header of the generated data
115
- writer.writerow(row)
116
- rows_generated += 1
117
- if rows_generated >= num_rows:
118
- break
119
 
120
- csv_buffer.seek(0)
121
- return csv_buffer
 
 
 
122
 
123
  class DataGenerationRequest(BaseModel):
124
  description: str
@@ -128,8 +138,16 @@ class DataGenerationRequest(BaseModel):
128
  def generate_data(request: DataGenerationRequest):
129
  description = request.description.strip()
130
  columns = [col.strip() for col in request.columns]
131
- csv_buffer = generate_large_synthetic_data(description, columns, num_rows=10000)
132
 
 
 
 
 
 
 
 
 
133
  # Return the CSV data as a downloadable file
134
  return StreamingResponse(
135
  csv_buffer,
 
1
  from fastapi import FastAPI, HTTPException
2
  from fastapi.responses import StreamingResponse, JSONResponse
3
  from pydantic import BaseModel
4
+ import pandas as pd
5
  import os
6
  import requests
7
  from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer, pipeline
8
  from io import StringIO
9
  from fastapi.middleware.cors import CORSMiddleware
10
  from huggingface_hub import HfFolder
11
+ from tqdm import tqdm
 
 
12
 
13
  app = FastAPI()
14
 
 
88
  else:
89
  raise ValueError("Invalid response format from Hugging Face API.")
90
  except (requests.RequestException, ValueError) as e:
91
+ print(f"Error during API request or response processing: {e}")
92
+ return ""
93
+
94
def process_generated_data(csv_data, expected_columns):
    """Parse one batch of generated CSV text into a validated DataFrame.

    Args:
        csv_data: Raw CSV text. Windows and bare carriage-return line
            endings are normalized before parsing.
        expected_columns: Column names the parsed header must contain.
            They are compared as a set, so column order does not matter.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when the text is blank,
        cannot be parsed, or its header does not match ``expected_columns``.
    """
    # Normalize line endings so every row is terminated by a plain newline.
    cleaned_data = csv_data.replace('\r\n', '\n').replace('\r', '\n')

    try:
        df = pd.read_csv(StringIO(cleaned_data), delimiter=',')
    except (pd.errors.ParserError, pd.errors.EmptyDataError) as e:
        # EmptyDataError is raised for blank/whitespace-only text and is not
        # a ParserError; catching it keeps one bad batch from aborting the
        # caller's whole generation loop.
        print(f"Failed to parse CSV data: {e}")
        return None

    # Reject batches whose header does not match what the caller asked for.
    if set(df.columns) != set(expected_columns):
        print(f"Unexpected columns in the generated data: {df.columns}")
        return None

    return df
112
 
113
def generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_generation=100):
    """Build a synthetic dataset by combining several generated batches.

    Args:
        description: Text description forwarded to ``generate_synthetic_data``.
        columns: Expected column names; also used to validate each batch.
        num_rows: Maximum number of rows in the returned frame.
        rows_per_generation: Assumed number of rows produced per generation
            call. It only determines how many calls are made; it is not
            passed to the generator.

    Returns:
        A ``pandas.DataFrame`` with at most ``num_rows`` rows. When no batch
        is valid, an empty frame with ``columns`` as its header is returned.
    """
    # Ceiling division so a trailing partial batch is still requested
    # (e.g. num_rows=50 with rows_per_generation=100 makes one call, not zero).
    num_batches = (num_rows + rows_per_generation - 1) // rows_per_generation
    data_frames = []

    for _ in tqdm(range(num_batches), desc="Generating Data"):
        generated_data = generate_synthetic_data(description, columns)
        if not generated_data:
            print("Skipping empty or invalid generation.")
            continue

        df_synthetic = process_generated_data(generated_data, columns)
        if df_synthetic is None or df_synthetic.empty:
            print("Skipping invalid generation.")
            continue

        data_frames.append(df_synthetic)

    if not data_frames:
        print("No valid data frames to concatenate.")
        return pd.DataFrame(columns=columns)

    # Batches may contain more rows than assumed; never exceed the request.
    return pd.concat(data_frames, ignore_index=True).head(num_rows)
132
 
133
  class DataGenerationRequest(BaseModel):
134
  description: str
 
138
  def generate_data(request: DataGenerationRequest):
139
  description = request.description.strip()
140
  columns = [col.strip() for col in request.columns]
141
+ csv_data = generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_generation=100)
142
 
143
+ if csv_data.empty:
144
+ return JSONResponse(content={"error": "No valid data generated"}, status_code=500)
145
+
146
+ # Convert the DataFrame to CSV format
147
+ csv_buffer = StringIO()
148
+ csv_data.to_csv(csv_buffer, index=False)
149
+ csv_buffer.seek(0)
150
+
151
  # Return the CSV data as a downloadable file
152
  return StreamingResponse(
153
  csv_buffer,