yakine committed on
Commit
9e25bdd
·
verified ·
1 Parent(s): e1129df

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -25
app.py CHANGED
@@ -1,12 +1,11 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import requests
4
- from transformers import GPT2LMHeadModel, GPT2Tokenizer, LlamaTokenizer, LlamaForCausalLM, pipeline
5
- from huggingface_hub import HfFolder, login
6
  from io import StringIO
7
  import os
8
  from flask import Flask, request, jsonify
9
- from huggingface_hub import HfFolder
10
 
11
  # Set environment variable to avoid floating-point errors
12
  os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
@@ -18,7 +17,7 @@ model_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')
18
  # Create a pipeline for text generation using GPT-2
19
  text_generator = pipeline("text-generation", model=model_gpt2, tokenizer=tokenizer)
20
 
21
- #Loading LLama3.1 tokenizer
22
  try:
23
  tokenizer_llama = LlamaTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B")
24
  except OSError as e:
@@ -27,26 +26,19 @@ except OSError as e:
27
  # Define your prompt template
28
  prompt_template = """\
29
  You are an expert in generating synthetic data for machine learning models.
30
-
31
  Your task is to generate a synthetic tabular dataset based on the description provided below.
32
-
33
  Description: {description}
34
-
35
  The dataset should include the following columns: {columns}
36
-
37
  Please provide the data in CSV format with a minimum of 100 rows per generation.
38
  Ensure that the data is realistic, does not contain any duplicate rows, and follows any specific conditions mentioned.
39
-
40
  Example Description:
41
  Generate a dataset for predicting house prices with columns: 'Size', 'Location', 'Number of Bedrooms', 'Price'
42
-
43
  Example Output:
44
  Size,Location,Number of Bedrooms,Price
45
  1200,Suburban,3,250000
46
  900,Urban,2,200000
47
  1500,Rural,4,300000
48
  ...
49
-
50
  Description:
51
  {description}
52
  Columns:
@@ -64,7 +56,6 @@ def format_prompt(description, columns):
64
 
65
  API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-8B"
66
 
67
-
68
  generation_params = {
69
  "top_p": 0.90,
70
  "temperature": 0.8,
@@ -74,20 +65,24 @@ generation_params = {
74
  }
75
 
76
  def generate_synthetic_data(description, columns):
77
- formatted_prompt = format_prompt(description, columns)
78
- payload = {"inputs": formatted_prompt, "parameters": generation_params}
79
- headers = {"Authorization": f"Bearer {HfFolder.get_token()}"}
80
- response = requests.post(API_URL, headers=headers, json=payload)
81
-
82
- if response.status_code == 200:
83
- response_json = response.json()
84
- if isinstance(response_json, list) and len(response_json) > 0 and "generated_text" in response_json[0]:
85
- return response_json[0]["generated_text"]
 
 
 
86
  else:
87
- raise ValueError("Unexpected response format or missing 'generated_text' key")
88
- else:
89
- print(f"Error details: {response.text}")
90
- raise ValueError(f"API request failed with status code {response.status_code}: {response.text}")
 
91
 
92
  def generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_generation=100):
93
  data_frames = []
@@ -95,6 +90,8 @@ def generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_
95
 
96
  for _ in range(num_iterations):
97
  generated_data = generate_synthetic_data(description, columns)
 
 
98
  df_synthetic = process_generated_data(generated_data)
99
  data_frames.append(df_synthetic)
100
 
@@ -109,6 +106,8 @@ def main(description, columns):
109
  description = description.strip()
110
  columns = [col.strip() for col in columns.split(',')]
111
  df_synthetic = generate_large_synthetic_data(description, columns)
 
 
112
  return df_synthetic.to_csv(index=False)
113
 
114
  # Gradio interface
 
1
  import gradio as gr
2
  import pandas as pd
3
  import requests
4
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer, LlamaTokenizer, pipeline
5
+ from huggingface_hub import HfFolder
6
  from io import StringIO
7
  import os
8
  from flask import Flask, request, jsonify
 
9
 
10
  # Set environment variable to avoid floating-point errors
11
  os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
 
17
  # Create a pipeline for text generation using GPT-2
18
  text_generator = pipeline("text-generation", model=model_gpt2, tokenizer=tokenizer)
19
 
20
+ # Loading LLama3.1 tokenizer
21
  try:
22
  tokenizer_llama = LlamaTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B")
23
  except OSError as e:
 
26
  # Define your prompt template
27
  prompt_template = """\
28
  You are an expert in generating synthetic data for machine learning models.
 
29
  Your task is to generate a synthetic tabular dataset based on the description provided below.
 
30
  Description: {description}
 
31
  The dataset should include the following columns: {columns}
 
32
  Please provide the data in CSV format with a minimum of 100 rows per generation.
33
  Ensure that the data is realistic, does not contain any duplicate rows, and follows any specific conditions mentioned.
 
34
  Example Description:
35
  Generate a dataset for predicting house prices with columns: 'Size', 'Location', 'Number of Bedrooms', 'Price'
 
36
  Example Output:
37
  Size,Location,Number of Bedrooms,Price
38
  1200,Suburban,3,250000
39
  900,Urban,2,200000
40
  1500,Rural,4,300000
41
  ...
 
42
  Description:
43
  {description}
44
  Columns:
 
56
 
57
  API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-8B"
58
 
 
59
  generation_params = {
60
  "top_p": 0.90,
61
  "temperature": 0.8,
 
65
  }
66
 
67
  def generate_synthetic_data(description, columns):
68
+ try:
69
+ formatted_prompt = format_prompt(description, columns)
70
+ payload = {"inputs": formatted_prompt, "parameters": generation_params}
71
+ headers = {"Authorization": f"Bearer {HfFolder.get_token()}"}
72
+ response = requests.post(API_URL, headers=headers, json=payload)
73
+
74
+ if response.status_code == 200:
75
+ response_json = response.json()
76
+ if isinstance(response_json, list) and len(response_json) > 0 and "generated_text" in response_json[0]:
77
+ return response_json[0]["generated_text"]
78
+ else:
79
+ raise ValueError("Unexpected response format or missing 'generated_text' key")
80
  else:
81
+ print(f"Error details: {response.text}")
82
+ raise ValueError(f"API request failed with status code {response.status_code}: {response.text}")
83
+ except Exception as e:
84
+ print(f"Error in generate_synthetic_data: {e}")
85
+ return f"Error: {e}"
86
 
87
  def generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_generation=100):
88
  data_frames = []
 
90
 
91
  for _ in range(num_iterations):
92
  generated_data = generate_synthetic_data(description, columns)
93
+ if "Error" in generated_data:
94
+ return generated_data
95
  df_synthetic = process_generated_data(generated_data)
96
  data_frames.append(df_synthetic)
97
 
 
106
  description = description.strip()
107
  columns = [col.strip() for col in columns.split(',')]
108
  df_synthetic = generate_large_synthetic_data(description, columns)
109
+ if isinstance(df_synthetic, str) and "Error" in df_synthetic:
110
+ return df_synthetic # Return the error message if any
111
  return df_synthetic.to_csv(index=False)
112
 
113
  # Gradio interface