danielnashed committed on
Commit 0ea134e · verified · 1 Parent(s): d5a5a2e

Update app.py

Files changed (1):
  1. app.py +65 -37
app.py CHANGED
@@ -6,12 +6,17 @@ import datasets
 from datasets import Dataset
 import json
 import pandas as pd
+import numpy as np
 import torch
 import wandb
+import copy
 import os
 import sys
+import re
 from peft import LoraConfig, TaskType, get_peft_model, AutoPeftModelForCausalLM
 from sklearn.model_selection import train_test_split
+import nltk
+from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

 IS_COLAB = False
 if "google.colab" in sys.modules or "google.colab" in os.environ:
@@ -88,7 +93,7 @@ class LLMTrainingApp:
             self.model = get_peft_model(base_model, self.peft_config)
             params = self.model.get_nb_trainable_parameters()
             percent_trainable = round(100 * (params[0] / params[1]), 2)
-            return f"✅ Loaded model into memory! Base Model card: {json.dumps(self.base_models[model_name])} - % of trainable parameters for PEFT model: {percent_trainable}"
+            return f"✅ Loaded model into memory! Base Model card: {json.dumps(self.base_models[model_name])} - % of trainable parameters for PEFT model: {percent_trainable}%"
         except Exception as e:
             return f"❌ Failed to load model and/or tokenizer: {str(e)}"

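Note: the `%` appended above completes the formatting of the trainable-parameter ratio that PEFT reports. As a minimal standalone sketch of where that ratio comes from (the base model and config here are illustrative, not the app's actual choices):

```python
# Hypothetical standalone example of the percentage computed above.
from transformers import AutoModelForCausalLM
from peft import LoraConfig, TaskType, get_peft_model

base = AutoModelForCausalLM.from_pretrained("gpt2")  # illustrative base model
peft_model = get_peft_model(base, LoraConfig(task_type=TaskType.CAUSAL_LM))

trainable, total = peft_model.get_nb_trainable_parameters()
print(f"{round(100 * trainable / total, 2)}% trainable")  # mirrors the status message
```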
@@ -107,7 +112,7 @@ class LLMTrainingApp:
                 model="gpt-4o",
                 messages=[
                     {
-                        "role": "user",
+                        "role": "system",
                         "content": """Given the following question-answer pairs, generate 10 similar pairs in the following json format below. Do not respond with anything other than the json.
                         ```json
                         [
@@ -121,6 +126,11 @@ class LLMTrainingApp:
                         }
                     ]
                     """
+                    },
+                    {
+                        "role": "user",
+                        "content": f"""Here are the question-answer pairs: {json.dumps(self.finetuning_dataset)}
+                        """
                     }
                 ]
             )
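Note: this pair of hunks splits the prompt so the generation instructions travel as a `system` message and the existing pairs as a separate `user` message. A minimal sketch of the same pattern, assuming the openai v1 client (the data here is illustrative):

```python
# Hypothetical sketch of the system/user split adopted above (openai v1 client).
import json
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment
pairs = [{"question": "What is LoRA?", "answer": "A low-rank adaptation method."}]

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "Generate 10 similar pairs. Respond with JSON only."},
        {"role": "user", "content": f"Here are the question-answer pairs: {json.dumps(pairs)}"},
    ],
)
print(response.choices[0].message.content)
```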
@@ -130,6 +140,8 @@ class LLMTrainingApp:
             print(f"clean response: {clean_response}")
             new_data = json.loads(clean_response)
             for i, row in enumerate(new_data):
+                row["question"] = row["question"].replace("### Question:", "").replace("### Answer:", "").strip()
+                row["answer"] = row["answer"].replace("### Answer:", "").strip()
                 self.finetuning_dataset.append({"question": self.prompt_template.format(question=row["question"]), "answer": row["answer"]})
             # create df to display
             df = pd.DataFrame(new_data)
@@ -141,8 +153,14 @@ class LLMTrainingApp:
         try:
             # Tokenize the question and answer as input and target (labels) for causal LM
             encoding = self.tokenizer(examples['question'], examples['answer'], padding=True)
-            # Set the labels as the input_ids
-            encoding['labels'] = encoding['input_ids'].copy()
+            # Create labels (same as input_ids, but mask the non-answer part)
+            labels = copy.deepcopy(encoding["input_ids"])
+            for i in range(len(examples["question"])):
+                # print(examples["question"][i])
+                question_length = len(self.tokenizer(examples['question'][i], add_special_tokens=False)["input_ids"])
+                # print(f'question length: {question_length}')
+                labels[i][:question_length] = [-100] * question_length  # Mask question tokens
+            encoding["labels"] = labels
             return encoding
         except Exception as e:
             return f"❌ Failed to tokenize input: {str(e)}"
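Note: `-100` is the label value that Hugging Face's cross-entropy loss ignores, so this hunk trains on answer tokens only rather than teaching the model to echo the question. A standalone sketch of the convention (the tokenizer choice is illustrative):

```python
# Standalone sketch of the -100 masking convention used in the hunk above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative tokenizer
question = "### Question: What is LoRA?"
answer = "A low-rank adaptation method."

enc = tokenizer(question, answer)
q_len = len(tokenizer(question, add_special_tokens=False)["input_ids"])

labels = enc["input_ids"].copy()
labels[:q_len] = [-100] * q_len  # question tokens contribute nothing to the loss
enc["labels"] = labels
```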
@@ -163,27 +181,42 @@ class LLMTrainingApp:

     def compute_bleu(self, eval_pred):
         predictions, labels = eval_pred
-        # # Flatten predictions and labels if they are in nested lists
-        # predictions = predictions.flatten()
-        # labels = labels.flatten()
-        # # Ensure that predictions and labels are integers
-        # predictions = predictions.astype(int)  # Convert to integer
-        # labels = labels.astype(int)  # Convert to integer
-        # # Decode the predicted tokens
-        # decoded_preds = self.tokenizer.decode(predictions, skip_special_tokens=True)
-        # decoded_labels = self.tokenizer.decode(labels, skip_special_tokens=True)
-        # result = self.metric.compute(predictions=[decoded_preds], references=[[decoded_labels]])
-        result = {"bleu": 1}
-        return result
-
-    def log_generator(self):
-        """ Continuously send logs to frontend during training """
-        for log in self.logging_callback.logs:
-            yield str(log)
+        self.predictions = predictions
+        self.labels = labels
+
+        # Convert logits to token IDs using argmax
+        predictions = np.argmax(predictions, axis=-1)
+
+        # Ensure predictions and labels are integers within vocab range
+        predictions = np.clip(predictions, 0, self.tokenizer.vocab_size - 1).astype(int)
+        labels = np.clip(labels, 0, self.tokenizer.vocab_size - 1).astype(int)
+
+        scores = []
+
+        for prediction, label in zip(predictions, labels):
+            print(f"Prediction: {prediction}, Label: {label}")
+
+            # Remove leading 0's from array
+            prediction = prediction[np.argmax(prediction != 0):]
+            label = label[np.argmax(label != 0):]
+
+            # Decode predicted tokens
+            decoded_preds = self.tokenizer.decode(prediction, skip_special_tokens=True).split()
+            decoded_labels = self.tokenizer.decode(label, skip_special_tokens=True).split()
+
+            scores.append(sentence_bleu([decoded_labels], decoded_preds, smoothing_function=SmoothingFunction().method1))
+
+        average_score = sum(scores) / len(scores)
+        print(f"Average BLEU score: {average_score}")
+        return {"bleu": average_score}
+
+        # return score
+        # return {"bleu": 1}

     def train_model(self):
         try:
             tokenized_datasets = self.prepare_data_for_training()
+            print('finished preparing data for training')

             # Create training arguments
             training_args = TrainingArguments(
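Note: `sentence_bleu` expects a pre-tokenized hypothesis and a list of pre-tokenized references, and `SmoothingFunction().method1` keeps short outputs with few n-gram matches from scoring exactly zero. A toy check of the call, independent of the model:

```python
# Toy check of the nltk BLEU call used in compute_bleu above.
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference = "the cat sat on the mat".split()
hypothesis = "the cat sat on a mat".split()

score = sentence_bleu([reference], hypothesis,
                      smoothing_function=SmoothingFunction().method1)
print(round(score, 3))  # in (0, 1]; higher means closer n-gram overlap
```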
@@ -198,6 +231,8 @@ class LLMTrainingApp:
                 load_best_model_at_end=True,
             )

+            print('training arguments set...')
+
             # Create trainer & attach logging callback
             trainer = Trainer(
                 model=self.model,
@@ -210,17 +245,16 @@ class LLMTrainingApp:
                 callbacks=[self.logging_callback],
             )

+            print('trainer set...')
+
             # Start training and yield logs in real-time
             trainer.train()

-            # for log in logging_callback.logs:
-            #     yield str(log)
-
             # Save trained model to HF
             self.model.save_pretrained(self.localpath)  # save to local
             self.model.push_to_hub(f"{self.model_name}-lora")

-            return "✅ Training complete!"
+            return f"✅ Training complete!\n {json.dumps(self.logging_callback.logs)}"
         except Exception as e:
             return f"❌ Training failed: {str(e)}"

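Note: `push_to_hub` on a PEFT model uploads only the LoRA adapter weights. The `AutoPeftModelForCausalLM` imported at the top of the file can reload them later, resolving the base model from the adapter config. A sketch (the repo id is a placeholder):

```python
# Hypothetical reload of the adapter pushed above; "user/model-lora" is a placeholder.
from peft import AutoPeftModelForCausalLM

model = AutoPeftModelForCausalLM.from_pretrained("user/model-lora")
model.eval()  # base weights are fetched per the adapter's config
```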
@@ -292,20 +326,12 @@ class LLMTrainingApp:
                     label="Golden + Synthetic Dataset"
                 )
                 generate_status = gr.Textbox(label="Dataset Generation Status")
-                generate_data_btn = gr.Button("Generate Dataset", variant="primary")
+                generate_data_btn = gr.Button("Extend Dataset", variant="primary")
                 generate_data_btn.click(self.extend_dataset, outputs=[generate_status, dataset_table])

             # Train Model & Visualize Loss
             with gr.Group():
-                gr.Markdown("### 5. Start Logging")
-                with gr.Column():
-                    train_status = gr.Textbox(label="Training Status", lines=10)
-                    train_btn = gr.Button("Train", variant="primary")
-                    train_btn.click(self.log_generator, outputs=[train_status])
-
-            # Train Model & Visualize Loss
-            with gr.Group():
-                gr.Markdown("### 6. Train Model")
+                gr.Markdown("### 5. Train Model")
                 with gr.Column():
                     train_status = gr.Textbox(label="Training Status")
                     train_btn = gr.Button("Train", variant="primary")
@@ -313,7 +339,7 @@ class LLMTrainingApp:

             # Run Inference
             with gr.Group():
-                gr.Markdown("### 7. Run Inference")
+                gr.Markdown("### 6. Run Inference")
                 with gr.Column():
                     user_prompt = gr.Textbox(label="Enter Prompt")
                     inference_btn = gr.Button("Run Inference", variant="primary")
@@ -326,4 +352,6 @@ class LLMTrainingApp:
 app = LLMTrainingApp()

 # Launch the Gradio app using the class method
-app.build_ui().launch()
+app.build_ui().launch()
+
+