yakine committed on
Commit
26ca4a0
·
verified ·
1 Parent(s): f6629ad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -14
app.py CHANGED
@@ -1,12 +1,12 @@
1
-
2
- import transformers
3
  import streamlit as st
4
  import pandas as pd
5
  import os
6
  import torch
 
7
  from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer, AutoModelForCausalLM, pipeline
8
  from huggingface_hub import HfFolder
9
  from io import StringIO
 
10
 
11
  # Access the Hugging Face API token from environment variables
12
  hf_token = os.getenv('HF_API_TOKEN')
@@ -18,26 +18,25 @@ HfFolder.save_token(hf_token)
18
  # Set environment variable to avoid floating-point errors
19
  os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
20
 
21
- # Load the tokenizer and model
22
- tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
23
  model_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')
24
 
25
  # Create a pipeline for text generation using GPT-2
26
- text_generator = pipeline("text-generation", model=model_gpt2, tokenizer=tokenizer)
27
 
28
- # Load the Llama model and tokenizer at the beginning
29
- model_name = "meta-llama/Meta-Llama-3.1-8B"
30
- tokenizer_llama = AutoTokenizer.from_pretrained(model_name, token=hf_token)
31
  model_llama = AutoModelForCausalLM.from_pretrained(
32
- model_name,
33
- torch_dtype="auto",
34
- device_map="auto",
35
  token=hf_token
36
  )
37
 
38
 
39
 
40
-
41
  prompt_template = """\
42
  You are an expert in generating synthetic data for machine learning models.
43
  Your task is to generate a synthetic tabular dataset based on the description provided below.
@@ -100,19 +99,25 @@ def generate_synthetic_data(description, columns):
100
  # Return the generated synthetic data
101
  return generated_text
102
  except Exception as e:
103
- print(f"Error in generate_synthetic_data: {e}")
104
  return f"Error: {e}"
105
 
106
  def generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_generation=100):
107
  data_frames = []
108
  num_iterations = num_rows // rows_per_generation
109
 
110
- for _ in range(num_iterations):
 
 
 
111
  generated_data = generate_synthetic_data(description, columns)
112
  if "Error" in generated_data:
113
  return generated_data
114
  df_synthetic = process_generated_data(generated_data)
115
  data_frames.append(df_synthetic)
 
 
 
116
 
117
  return pd.concat(data_frames, ignore_index=True)
118
 
 
 
 
1
  import streamlit as st
2
  import pandas as pd
3
  import os
4
  import torch
5
+ import transformers
6
  from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer, AutoModelForCausalLM, pipeline
7
  from huggingface_hub import HfFolder
8
  from io import StringIO
9
+ from tqdm import tqdm # Console progress bar; the in-app bar is drawn with st.progress
10
 
11
  # Access the Hugging Face API token from environment variables
12
  hf_token = os.getenv('HF_API_TOKEN')
 
18
  # Set environment variable to avoid floating-point errors
19
  os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
20
 
21
+ # Load the GPT-2 tokenizer and model
22
+ tokenizer_gpt2 = GPT2Tokenizer.from_pretrained('gpt2')
23
  model_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')
24
 
25
  # Create a pipeline for text generation using GPT-2
26
+ text_generator = pipeline("text-generation", model=model_gpt2, tokenizer=tokenizer_gpt2)
27
 
28
+ # Load the Llama-3 model and tokenizer once during startup
29
+ tokenizer_llama = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B", token=hf_token)
 
30
  model_llama = AutoModelForCausalLM.from_pretrained(
31
+ "meta-llama/Meta-Llama-3.1-8B",
32
+ torch_dtype= 'auto',
33
+ device_map= 'auto',
34
  token=hf_token
35
  )
36
 
37
 
38
 
39
+ # Define your prompt template
40
  prompt_template = """\
41
  You are an expert in generating synthetic data for machine learning models.
42
  Your task is to generate a synthetic tabular dataset based on the description provided below.
 
99
  # Return the generated synthetic data
100
  return generated_text
101
  except Exception as e:
102
+ st.error(f"Error in generate_synthetic_data: {e}")
103
  return f"Error: {e}"
104
 
105
  def generate_large_synthetic_data(description, columns, num_rows=1000, rows_per_generation=100):
106
  data_frames = []
107
  num_iterations = num_rows // rows_per_generation
108
 
109
+ # Create a progress bar
110
+ progress_bar = st.progress(0)
111
+
112
+ for i in tqdm(range(num_iterations)):
113
  generated_data = generate_synthetic_data(description, columns)
114
  if "Error" in generated_data:
115
  return generated_data
116
  df_synthetic = process_generated_data(generated_data)
117
  data_frames.append(df_synthetic)
118
+
119
+ # Update the progress bar
120
+ progress_bar.progress((i + 1) / num_iterations)
121
 
122
  return pd.concat(data_frames, ignore_index=True)
123