Spaces:

yakine
/

best

Sleeping

App Files Community

yakine committed on Aug 12, 2024

Commit

abf2c45

verified ·

1 Parent(s): 13454c6

Update app.py

Browse files

Files changed (1) hide show

app.py +2 -26

app.py CHANGED Viewed

@@ -6,12 +6,7 @@ import os
 import torch
 from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer, AutoModelForCausalLM, pipeline
 from io import StringIO
-from tqdm import tqdm
-import accelerate
-from accelerate import init_empty_weights, disk_offload
 from fastapi.middleware.cors import CORSMiddleware
-import re
 app = FastAPI()
@@ -46,7 +41,7 @@ model_llama = AutoModelForCausalLM.from_pretrained(
     token=hf_token
 )
-# Define your prompt template with a check for the purpose of dataset generation
 prompt_template = """\
 You are an expert in generating synthetic data for machine learning models.
 Your task is to generate a synthetic tabular dataset based on the description provided below.
@@ -54,7 +49,6 @@ Description: {description}
 The dataset should include the following columns: {columns}
 Please provide the data in CSV format with a minimum of 100 rows per generation.
 Ensure that the data is realistic, does not contain any duplicate rows, and follows any specific conditions mentioned.
-If the description is not related to generating synthetic datasets, please return the message: "This request is not for generating synthetic datasets."
 Example Description:
 Generate a dataset for predicting house prices with columns: 'Size', 'Location', 'Number of Bedrooms', 'Price'
 Example Output:
@@ -113,24 +107,10 @@ def generate_synthetic_data(description, columns):
         return generated_text
     except Exception as e:
         return f"Error: {e}"
-def clean_generated_text(generated_text):
-    # Extract CSV part using a regular expression
-    csv_match = re.search(r'(\n?([A-Za-z0-9_]+,)*[A-Za-z0-9_]+\n([^\n,]*,)*[^\n,]*\n*)+', generated_text)
-    if csv_match:
-        csv_text = csv_match.group(0)
-    else:
-        raise ValueError("No valid CSV data found in generated text.")
-    return csv_text
 def process_generated_data(csv_data):
-    # Clean the generated data
-    cleaned_data = clean_generated_text(csv_data)
     # Convert to DataFrame
-    data = StringIO(cleaned_data)
     df = pd.read_csv(data)
     return df
@@ -144,10 +124,6 @@ def generate_data(request: DataGenerationRequest):
     if "Error" in generated_data:
         return JSONResponse(content={"error": generated_data}, status_code=500)
-    # Check if the model generated a message about incorrect purpose
-    if "This request is not for generating synthetic datasets." in generated_data:
-        return JSONResponse(content={"message": "This request is not for generating synthetic datasets."}, status_code=400)
     # Process the generated CSV data into a DataFrame
     df_synthetic = process_generated_data(generated_data)
     return JSONResponse(content={"data": df_synthetic.to_dict(orient="records")})

 import torch
 from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer, AutoModelForCausalLM, pipeline
 from io import StringIO
 from fastapi.middleware.cors import CORSMiddleware
 app = FastAPI()
     token=hf_token
 )
+# Define your prompt template
 prompt_template = """\
 You are an expert in generating synthetic data for machine learning models.
 Your task is to generate a synthetic tabular dataset based on the description provided below.
 The dataset should include the following columns: {columns}
 Please provide the data in CSV format with a minimum of 100 rows per generation.
 Ensure that the data is realistic, does not contain any duplicate rows, and follows any specific conditions mentioned.
 Example Description:
 Generate a dataset for predicting house prices with columns: 'Size', 'Location', 'Number of Bedrooms', 'Price'
 Example Output:
         return generated_text
     except Exception as e:
         return f"Error: {e}"
 def process_generated_data(csv_data):
     # Convert to DataFrame
+    data = StringIO(csv_data)
     df = pd.read_csv(data)
     return df
     if "Error" in generated_data:
         return JSONResponse(content={"error": generated_data}, status_code=500)
     # Process the generated CSV data into a DataFrame
     df_synthetic = process_generated_data(generated_data)
     return JSONResponse(content={"data": df_synthetic.to_dict(orient="records")})