Spaces:

yakine
/

best

Sleeping

App Files Community

yakine committed on Aug 13, 2024

Commit

857e937

verified ·

1 Parent(s): 31d4200

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -16

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from fastapi import FastAPI, HTTPException
-from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
 import pandas as pd
 import os
@@ -41,32 +41,24 @@ def preprocess_user_prompt(user_prompt):
 # Define prompt template
 prompt_template = """\
 You are an expert in generating synthetic data for machine learning models.
 Your task is to generate a synthetic tabular dataset based on the description provided below.
 Description: {description}
 The dataset should include the following columns: {columns}
 Please provide the data in CSV format.
 Example Description:
 Generate a dataset for predicting house prices with columns: 'Size', 'Location', 'Number of Bedrooms', 'Price'
 Example Output:
 Size,Location,Number of Bedrooms,Price
 1200,Suburban,3,250000
 900,Urban,2,200000
 1500,Rural,4,300000
 ...
 Description:
 {description}
 Columns:
 {columns}
 Output: """
 tokenizer_mixtral = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1", token=hf_token)
 def format_prompt(description, columns):
@@ -87,8 +79,17 @@ generation_params = {
 def generate_synthetic_data(description, columns):
     formatted_prompt = format_prompt(description, columns)
     payload = {"inputs": formatted_prompt, "parameters": generation_params}
-    response = requests.post(API_URL, headers={"Authorization": f"Bearer {hf_token}"}, json=payload)
-    return response.json()[0]["generated_text"]
 def process_generated_data(csv_data, expected_columns):
     try:
@@ -114,12 +115,14 @@ def generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_
     for _ in tqdm(range(num_rows // rows_per_generation), desc="Generating Data"):
         generated_data = generate_synthetic_data(description, columns)
-        df_synthetic = process_generated_data(generated_data, columns)
-        if df_synthetic is not None and not df_synthetic.empty:
-            data_frames.append(df_synthetic)
         else:
-            print("Skipping invalid generation.")
     if data_frames:
         return pd.concat(data_frames, ignore_index=True)

 from fastapi import FastAPI, HTTPException
+from fastapi.responses import StreamingResponse, JSONResponse
 from pydantic import BaseModel
 import pandas as pd
 import os
 # Define prompt template
 prompt_template = """\
 You are an expert in generating synthetic data for machine learning models.
 Your task is to generate a synthetic tabular dataset based on the description provided below.
 Description: {description}
 The dataset should include the following columns: {columns}
 Please provide the data in CSV format.
 Example Description:
 Generate a dataset for predicting house prices with columns: 'Size', 'Location', 'Number of Bedrooms', 'Price'
 Example Output:
 Size,Location,Number of Bedrooms,Price
 1200,Suburban,3,250000
 900,Urban,2,200000
 1500,Rural,4,300000
 ...
 Description:
 {description}
 Columns:
 {columns}
 Output: """
 tokenizer_mixtral = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1", token=hf_token)
 def format_prompt(description, columns):
 def generate_synthetic_data(description, columns):
     formatted_prompt = format_prompt(description, columns)
     payload = {"inputs": formatted_prompt, "parameters": generation_params}
+    try:
+        response = requests.post(API_URL, headers={"Authorization": f"Bearer {hf_token}"}, json=payload)
+        response.raise_for_status()
+        data = response.json()
+        if 'generated_text' in data[0]:
+            return data[0]['generated_text']
+        else:
+            raise ValueError("Invalid response format from Hugging Face API.")
+    except (requests.RequestException, ValueError) as e:
+        print(f"Error during API request or response processing: {e}")
+        return ""
 def process_generated_data(csv_data, expected_columns):
     try:
     for _ in tqdm(range(num_rows // rows_per_generation), desc="Generating Data"):
         generated_data = generate_synthetic_data(description, columns)
+        if generated_data:
+            df_synthetic = process_generated_data(generated_data, columns)
+            if df_synthetic is not None and not df_synthetic.empty:
+                data_frames.append(df_synthetic)
+            else:
+                print("Skipping invalid generation.")
         else:
+            print("Skipping empty or invalid generation.")
     if data_frames:
         return pd.concat(data_frames, ignore_index=True)