Spaces:

yakine
/

Dataset_Generator

Sleeping

App Files Community

yakine committed on Aug 9, 2024

Commit

9e25bdd

verified ·

1 Parent(s): e1129df

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -25

app.py CHANGED Viewed

@@ -1,12 +1,11 @@
 import gradio as gr
 import pandas as pd
 import requests
-from transformers import GPT2LMHeadModel, GPT2Tokenizer, LlamaTokenizer, LlamaForCausalLM, pipeline
-from huggingface_hub import HfFolder, login
 from io import StringIO
 import os
 from flask import Flask, request, jsonify
-from huggingface_hub import HfFolder
 # Set environment variable to avoid floating-point errors
 os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
@@ -18,7 +17,7 @@ model_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')
 # Create a pipeline for text generation using GPT-2
 text_generator = pipeline("text-generation", model=model_gpt2, tokenizer=tokenizer)
-#Loading LLama3.1 tokenizer
 try:
     tokenizer_llama = LlamaTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B")
 except OSError as e:
@@ -27,26 +26,19 @@ except OSError as e:
 # Define your prompt template
 prompt_template = """\
 You are an expert in generating synthetic data for machine learning models.
 Your task is to generate a synthetic tabular dataset based on the description provided below.
 Description: {description}
 The dataset should include the following columns: {columns}
 Please provide the data in CSV format with a minimum of 100 rows per generation.
 Ensure that the data is realistic, does not contain any duplicate rows, and follows any specific conditions mentioned.
 Example Description:
 Generate a dataset for predicting house prices with columns: 'Size', 'Location', 'Number of Bedrooms', 'Price'
 Example Output:
 Size,Location,Number of Bedrooms,Price
 1200,Suburban,3,250000
 900,Urban,2,200000
 1500,Rural,4,300000
 ...
 Description:
 {description}
 Columns:
@@ -64,7 +56,6 @@ def format_prompt(description, columns):
 API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-8B"
 generation_params = {
     "top_p": 0.90,
     "temperature": 0.8,
@@ -74,20 +65,24 @@ generation_params = {
 }
 def generate_synthetic_data(description, columns):
-    formatted_prompt = format_prompt(description, columns)
-    payload = {"inputs": formatted_prompt, "parameters": generation_params}
-    headers = {"Authorization": f"Bearer {HfFolder.get_token()}"}
-    response = requests.post(API_URL, headers=headers, json=payload)
-    if response.status_code == 200:
-        response_json = response.json()
-        if isinstance(response_json, list) and len(response_json) > 0 and "generated_text" in response_json[0]:
-            return response_json[0]["generated_text"]
         else:
-            raise ValueError("Unexpected response format or missing 'generated_text' key")
-    else:
-        print(f"Error details: {response.text}")
-        raise ValueError(f"API request failed with status code {response.status_code}: {response.text}")
 def generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_generation=100):
     data_frames = []
@@ -95,6 +90,8 @@ def generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_
     for _ in range(num_iterations):
         generated_data = generate_synthetic_data(description, columns)
         df_synthetic = process_generated_data(generated_data)
         data_frames.append(df_synthetic)
@@ -109,6 +106,8 @@ def main(description, columns):
     description = description.strip()
     columns = [col.strip() for col in columns.split(',')]
     df_synthetic = generate_large_synthetic_data(description, columns)
     return df_synthetic.to_csv(index=False)
 # Gradio interface

 import gradio as gr
 import pandas as pd
 import requests
+from transformers import GPT2LMHeadModel, GPT2Tokenizer, LlamaTokenizer, pipeline
+from huggingface_hub import HfFolder
 from io import StringIO
 import os
 from flask import Flask, request, jsonify
 # Set environment variable to avoid floating-point errors
 os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
 # Create a pipeline for text generation using GPT-2
 text_generator = pipeline("text-generation", model=model_gpt2, tokenizer=tokenizer)
+# Loading LLama3.1 tokenizer
 try:
     tokenizer_llama = LlamaTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B")
 except OSError as e:
 # Define your prompt template
 prompt_template = """\
 You are an expert in generating synthetic data for machine learning models.
 Your task is to generate a synthetic tabular dataset based on the description provided below.
 Description: {description}
 The dataset should include the following columns: {columns}
 Please provide the data in CSV format with a minimum of 100 rows per generation.
 Ensure that the data is realistic, does not contain any duplicate rows, and follows any specific conditions mentioned.
 Example Description:
 Generate a dataset for predicting house prices with columns: 'Size', 'Location', 'Number of Bedrooms', 'Price'
 Example Output:
 Size,Location,Number of Bedrooms,Price
 1200,Suburban,3,250000
 900,Urban,2,200000
 1500,Rural,4,300000
 ...
 Description:
 {description}
 Columns:
 API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-8B"
 generation_params = {
     "top_p": 0.90,
     "temperature": 0.8,
 }
 def generate_synthetic_data(description, columns):
+    try:
+        formatted_prompt = format_prompt(description, columns)
+        payload = {"inputs": formatted_prompt, "parameters": generation_params}
+        headers = {"Authorization": f"Bearer {HfFolder.get_token()}"}
+        response = requests.post(API_URL, headers=headers, json=payload)
+        if response.status_code == 200:
+            response_json = response.json()
+            if isinstance(response_json, list) and len(response_json) > 0 and "generated_text" in response_json[0]:
+                return response_json[0]["generated_text"]
+            else:
+                raise ValueError("Unexpected response format or missing 'generated_text' key")
         else:
+            print(f"Error details: {response.text}")
+            raise ValueError(f"API request failed with status code {response.status_code}: {response.text}")
+    except Exception as e:
+        print(f"Error in generate_synthetic_data: {e}")
+        return f"Error: {e}"
 def generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_generation=100):
     data_frames = []
     for _ in range(num_iterations):
         generated_data = generate_synthetic_data(description, columns)
+        if "Error" in generated_data:
+            return generated_data
         df_synthetic = process_generated_data(generated_data)
         data_frames.append(df_synthetic)
     description = description.strip()
     columns = [col.strip() for col in columns.split(',')]
     df_synthetic = generate_large_synthetic_data(description, columns)
+    if isinstance(df_synthetic, str) and "Error" in df_synthetic:
+        return df_synthetic  # Return the error message if any
     return df_synthetic.to_csv(index=False)
 # Gradio interface