Spaces:

danielnashed
/

finetuning-llms

Sleeping

App Files Files Community

danielnashed committed 29 days ago

Commit

0ea134e

verified ·

1 Parent(s): d5a5a2e

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -37

app.py CHANGED Viewed

@@ -6,12 +6,17 @@ import datasets
 from datasets import Dataset
 import json
 import pandas as pd
 import torch
 import wandb
 import os
 import sys
 from peft import LoraConfig, TaskType, get_peft_model, AutoPeftModelForCausalLM
 from sklearn.model_selection import train_test_split
 IS_COLAB = False
 if "google.colab" in sys.modules or "google.colab" in os.environ:
@@ -88,7 +93,7 @@ class LLMTrainingApp:
             self.model = get_peft_model(base_model, self.peft_config)
             params = self.model.get_nb_trainable_parameters()
             percent_trainable = round(100 * (params[0] / params[1]), 2)
-            return f"✅ Loaded model into memory! Base Model card: {json.dumps(self.base_models[model_name])} - % of trainable parameters for PEFT model: {percent_trainable}"
         except Exception as e:
               return f"❌ Failed to load model and/or tokenizer: {str(e)}"
@@ -107,7 +112,7 @@ class LLMTrainingApp:
                 model="gpt-4o",
                 messages=[
                     {
-                        "role": "user",
                         "content": """Given the following question-answer pairs, generate 10 similar pairs in the following json format below. Do not respond with anything other than the json.
                         ```json
                         [
@@ -121,6 +126,11 @@ class LLMTrainingApp:
                           }
                         ]
                         """
                     }
                 ]
             )
@@ -130,6 +140,8 @@ class LLMTrainingApp:
             print(f"clean response: {clean_response}")
             new_data = json.loads(clean_response)
             for i, row in enumerate(new_data):
               self.finetuning_dataset.append({"question": self.prompt_template.format(question=row["question"]), "answer": row["answer"]})
             # create df to display
             df = pd.DataFrame(new_data)
@@ -141,8 +153,14 @@ class LLMTrainingApp:
         try:
             # Tokenize the question and answer as input and target (labels) for causal LM
             encoding = self.tokenizer(examples['question'], examples['answer'], padding=True)
-            # Set the labels as the input_ids
-            encoding['labels'] = encoding['input_ids'].copy()
             return encoding
         except Exception as e:
             return f"❌ Failed to tokenize input: {str(e)}"
@@ -163,27 +181,42 @@ class LLMTrainingApp:
     def compute_bleu(self, eval_pred):
         predictions, labels = eval_pred
-        # # Flatten predictions and labels if they are in nested lists
-        # predictions = predictions.flatten()
-        # labels = labels.flatten()
-        # # Ensure that predictions and labels are integers
-        # predictions = predictions.astype(int)  # Convert to integer
-        # labels = labels.astype(int)  # Convert to integer
-        # # Decode the predicted tokens
-        # decoded_preds = self.tokenizer.decode(predictions, skip_special_tokens=True)
-        # decoded_labels = self.tokenizer.decode(labels, skip_special_tokens=True)
-        # result = self.metric.compute(predictions=[decoded_preds], references=[[decoded_labels]])
-        result = {"bleu": 1}
-        return result
-    def log_generator(self):
-        """ Continuously send logs to frontend during training """
-        for log in self.logging_callback.logs:
-            yield str(log)
     def train_model(self):
         try:
             tokenized_datasets = self.prepare_data_for_training()
             # Create training arguments
             training_args = TrainingArguments(
@@ -198,6 +231,8 @@ class LLMTrainingApp:
                 load_best_model_at_end=True,
             )
             # Create trainer & attach logging callback
             trainer = Trainer(
                 model=self.model,
@@ -210,17 +245,16 @@ class LLMTrainingApp:
                 callbacks=[self.logging_callback],
             )
             # Start training and yield logs in real-time
             trainer.train()
-            # for log in logging_callback.logs:
-            #     yield str(log)
             # Save trained model to HF
             self.model.save_pretrained(self.localpath) # save to local
             self.model.push_to_hub(f"{self.model_name}-lora")
-            return "✅ Training complete!"
         except Exception as e:
             return f"❌ Training failed: {str(e)}"
@@ -292,20 +326,12 @@ class LLMTrainingApp:
                         label="Golden + Synthetic Dataset"
                     )
                     generate_status = gr.Textbox(label="Dataset Generation Status")
-                    generate_data_btn = gr.Button("Generate Dataset", variant="primary")
                 generate_data_btn.click(self.extend_dataset, outputs=[generate_status, dataset_table])
             # Train Model & Visualize Loss
             with gr.Group():
-                gr.Markdown("### 5. Start Logging")
-                with gr.Column():
-                    train_status = gr.Textbox(label="Training Status", lines=10)
-                    train_btn = gr.Button("Train", variant="primary")
-                train_btn.click(self.log_generator, outputs=[train_status])
-            # Train Model & Visualize Loss
-            with gr.Group():
-                gr.Markdown("### 6. Train Model")
                 with gr.Column():
                     train_status = gr.Textbox(label="Training Status")
                     train_btn = gr.Button("Train", variant="primary")
@@ -313,7 +339,7 @@ class LLMTrainingApp:
             # Run Inference
             with gr.Group():
-                gr.Markdown("### 7. Run Inference")
                 with gr.Column():
                     user_prompt = gr.Textbox(label="Enter Prompt")
                     inference_btn = gr.Button("Run Inference", variant="primary")
@@ -326,4 +352,6 @@ class LLMTrainingApp:
 app = LLMTrainingApp()
 # Launch the Gradio app using the class method
-app.build_ui().launch()

 from datasets import Dataset
 import json
 import pandas as pd
+import numpy as np
 import torch
 import wandb
+import copy
 import os
 import sys
+import re
 from peft import LoraConfig, TaskType, get_peft_model, AutoPeftModelForCausalLM
 from sklearn.model_selection import train_test_split
+import nltk
+from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
 IS_COLAB = False
 if "google.colab" in sys.modules or "google.colab" in os.environ:
             self.model = get_peft_model(base_model, self.peft_config)
             params = self.model.get_nb_trainable_parameters()
             percent_trainable = round(100 * (params[0] / params[1]), 2)
+            return f"✅ Loaded model into memory! Base Model card: {json.dumps(self.base_models[model_name])} - % of trainable parameters for PEFT model: {percent_trainable}%"
         except Exception as e:
               return f"❌ Failed to load model and/or tokenizer: {str(e)}"
                 model="gpt-4o",
                 messages=[
                     {
+                        "role": "system",
                         "content": """Given the following question-answer pairs, generate 10 similar pairs in the following json format below. Do not respond with anything other than the json.
                         ```json
                         [
                           }
                         ]
                         """
+                    },
+                    {
+                        "role": "user",
+                        "content": f"""Here are the question-answer pairs: {json.dumps(self.finetuning_dataset)}
+                        """
                     }
                 ]
             )
             print(f"clean response: {clean_response}")
             new_data = json.loads(clean_response)
             for i, row in enumerate(new_data):
+              row["question"] = row["question"].replace("### Question:", "").replace("### Answer:", "").strip()
+              row["answer"] = row["answer"].replace("### Answer:", "").strip()
               self.finetuning_dataset.append({"question": self.prompt_template.format(question=row["question"]), "answer": row["answer"]})
             # create df to display
             df = pd.DataFrame(new_data)
         try:
             # Tokenize the question and answer as input and target (labels) for causal LM
             encoding = self.tokenizer(examples['question'], examples['answer'], padding=True)
+            # Create labels (same as input_ids, but mask the non-answer part)
+            labels = copy.deepcopy(encoding["input_ids"])
+            for i in range(len(examples["question"])):
+                # print(examples["question"][i])
+                question_length = len(self.tokenizer(examples['question'][i], add_special_tokens=False)["input_ids"])
+                # print(f'question length: {question_length}')
+                labels[i][:question_length] = [-100] * question_length  # Mask question tokens
+            encoding["labels"] = labels
             return encoding
         except Exception as e:
             return f"❌ Failed to tokenize input: {str(e)}"
     def compute_bleu(self, eval_pred):
         predictions, labels = eval_pred
+        self.predictions = predictions
+        self.labels = labels
+        # Convert logits to token IDs using argmax
+        predictions = np.argmax(predictions, axis=-1)
+        # Ensure predictions and labels are integers within vocab range
+        predictions = np.clip(predictions, 0, self.tokenizer.vocab_size - 1).astype(int)
+        labels = np.clip(labels, 0, self.tokenizer.vocab_size - 1).astype(int)
+        scores = []
+        for prediction, label in zip(predictions, labels):
+            print(f"Prediction: {prediction}, Label: {label}")
+            # Remove leading 0's from array
+            prediction = prediction[np.argmax(prediction != 0):]
+            label = label[np.argmax(label != 0):]
+            # Decode predicted tokens
+            decoded_preds = self.tokenizer.decode(prediction, skip_special_tokens=True).split()
+            decoded_labels = self.tokenizer.decode(label, skip_special_tokens=True).split()
+            scores.append(sentence_bleu([decoded_labels], decoded_preds, smoothing_function=SmoothingFunction().method1))
+        average_score = sum(scores) / len(scores)
+        print(f"Average BLEU score: {average_score}")
+        return {"bleu": average_score}
+        # return score
+        # return {"bleu": 1}
     def train_model(self):
         try:
             tokenized_datasets = self.prepare_data_for_training()
+            print('finished preparing data for training')
             # Create training arguments
             training_args = TrainingArguments(
                 load_best_model_at_end=True,
             )
+            print('training arguments set...')
             # Create trainer & attach logging callback
             trainer = Trainer(
                 model=self.model,
                 callbacks=[self.logging_callback],
             )
+            print('trainer set...')
             # Start training and yield logs in real-time
             trainer.train()
             # Save trained model to HF
             self.model.save_pretrained(self.localpath) # save to local
             self.model.push_to_hub(f"{self.model_name}-lora")
+            return f"✅ Training complete!\n {json.dumps(self.logging_callback.logs)}"
         except Exception as e:
             return f"❌ Training failed: {str(e)}"
                         label="Golden + Synthetic Dataset"
                     )
                     generate_status = gr.Textbox(label="Dataset Generation Status")
+                    generate_data_btn = gr.Button("Extend Dataset", variant="primary")
                 generate_data_btn.click(self.extend_dataset, outputs=[generate_status, dataset_table])
             # Train Model & Visualize Loss
             with gr.Group():
+                gr.Markdown("### 5. Train Model")
                 with gr.Column():
                     train_status = gr.Textbox(label="Training Status")
                     train_btn = gr.Button("Train", variant="primary")
             # Run Inference
             with gr.Group():
+                gr.Markdown("### 6. Run Inference")
                 with gr.Column():
                     user_prompt = gr.Textbox(label="Enter Prompt")
                     inference_btn = gr.Button("Run Inference", variant="primary")
 app = LLMTrainingApp()
 # Launch the Gradio app using the class method
+app.build_ui().launch()