yakine committed on
Commit
c549aa3
·
verified ·
1 Parent(s): 857e937

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -47
app.py CHANGED
@@ -1,14 +1,15 @@
1
  from fastapi import FastAPI, HTTPException
2
  from fastapi.responses import StreamingResponse, JSONResponse
3
  from pydantic import BaseModel
4
- import pandas as pd
5
  import os
6
  import requests
7
  from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer, pipeline
8
  from io import StringIO
9
  from fastapi.middleware.cors import CORSMiddleware
10
  from huggingface_hub import HfFolder
11
- from tqdm import tqdm
 
 
12
 
13
  app = FastAPI()
14
 
@@ -88,47 +89,36 @@ def generate_synthetic_data(description, columns):
88
  else:
89
  raise ValueError("Invalid response format from Hugging Face API.")
90
  except (requests.RequestException, ValueError) as e:
91
- print(f"Error during API request or response processing: {e}")
92
- return ""
93
-
94
- def process_generated_data(csv_data, expected_columns):
95
- try:
96
- # Ensure the data is cleaned and correctly formatted
97
- cleaned_data = csv_data.replace('\r\n', '\n').replace('\r', '\n')
98
- data = StringIO(cleaned_data)
99
-
100
- # Read the CSV data
101
- df = pd.read_csv(data, delimiter=',')
102
-
103
- # Check if the DataFrame has the expected columns
104
- if set(df.columns) != set(expected_columns):
105
- print(f"Unexpected columns in the generated data: {df.columns}")
106
- return None
107
-
108
- return df
109
- except pd.errors.ParserError as e:
110
- print(f"Failed to parse CSV data: {e}")
111
- return None
112
 
113
  def generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_generation=100):
114
- data_frames = []
 
 
 
 
115
 
116
- for _ in tqdm(range(num_rows // rows_per_generation), desc="Generating Data"):
 
117
  generated_data = generate_synthetic_data(description, columns)
118
- if generated_data:
119
- df_synthetic = process_generated_data(generated_data, columns)
120
- if df_synthetic is not None and not df_synthetic.empty:
121
- data_frames.append(df_synthetic)
122
- else:
123
- print("Skipping invalid generation.")
124
- else:
125
- print("Skipping empty or invalid generation.")
 
 
 
 
 
 
126
 
127
- if data_frames:
128
- return pd.concat(data_frames, ignore_index=True)
129
- else:
130
- print("No valid data frames to concatenate.")
131
- return pd.DataFrame(columns=columns)
132
 
133
  class DataGenerationRequest(BaseModel):
134
  description: str
@@ -138,16 +128,8 @@ class DataGenerationRequest(BaseModel):
138
  def generate_data(request: DataGenerationRequest):
139
  description = request.description.strip()
140
  columns = [col.strip() for col in request.columns]
141
- csv_data = generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_generation=100)
142
 
143
- if csv_data.empty:
144
- return JSONResponse(content={"error": "No valid data generated"}, status_code=500)
145
-
146
- # Convert the DataFrame to CSV format
147
- csv_buffer = StringIO()
148
- csv_data.to_csv(csv_buffer, index=False)
149
- csv_buffer.seek(0)
150
-
151
  # Return the CSV data as a downloadable file
152
  return StreamingResponse(
153
  csv_buffer,
 
1
  from fastapi import FastAPI, HTTPException
2
  from fastapi.responses import StreamingResponse, JSONResponse
3
  from pydantic import BaseModel
 
4
  import os
5
  import requests
6
  from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer, pipeline
7
  from io import StringIO
8
  from fastapi.middleware.cors import CORSMiddleware
9
  from huggingface_hub import HfFolder
10
+ import logging
11
+ import random
12
+ import csv
13
 
14
  app = FastAPI()
15
 
 
89
  else:
90
  raise ValueError("Invalid response format from Hugging Face API.")
91
  except (requests.RequestException, ValueError) as e:
92
+ logging.error(f"Error during API request or response processing: {e}")
93
+ return "name,age,course,grade\nSampleName,20,Course,0"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  def generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_generation=100):
96
+ csv_buffer = StringIO()
97
+ writer = csv.writer(csv_buffer)
98
+
99
+ # Write header
100
+ writer.writerow(columns)
101
 
102
+ rows_generated = 0
103
+ while rows_generated < num_rows:
104
  generated_data = generate_synthetic_data(description, columns)
105
+ cleaned_data = generated_data.replace('\r\n', '\n').replace('\r', '\n')
106
+ data = StringIO(cleaned_data)
107
+
108
+ # Append rows to CSV buffer
109
+ reader = csv.reader(data)
110
+ header_written = False
111
+ for row in reader:
112
+ if not header_written:
113
+ header_written = True
114
+ continue # Skip the header of the generated data
115
+ writer.writerow(row)
116
+ rows_generated += 1
117
+ if rows_generated >= num_rows:
118
+ break
119
 
120
+ csv_buffer.seek(0)
121
+ return csv_buffer
 
 
 
122
 
123
  class DataGenerationRequest(BaseModel):
124
  description: str
 
128
  def generate_data(request: DataGenerationRequest):
129
  description = request.description.strip()
130
  columns = [col.strip() for col in request.columns]
131
+ csv_buffer = generate_large_synthetic_data(description, columns, num_rows=10000)
132
 
 
 
 
 
 
 
 
 
133
  # Return the CSV data as a downloadable file
134
  return StreamingResponse(
135
  csv_buffer,