Kevin Fink committed
Commit 5a28d7d · 1 Parent(s): 683b1e4
Files changed (1)
  1. app.py +27 -13
app.py CHANGED
@@ -3,6 +3,7 @@ import gradio as gr
 from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM, TrainerCallback
 from transformers import DataCollatorForSeq2Seq
 from datasets import load_dataset
+from datasets import concatenate_datasets
 import traceback
 from huggingface_hub import login
 from peft import get_peft_model, LoraConfig
@@ -43,30 +44,43 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch
 
     # Tokenize the dataset
     def tokenize_function(examples):
-
-        # Assuming 'text' is the input and 'target' is the expected output
+        # Tokenize the input text
         model_inputs = tokenizer(
             examples['text'],
-            max_length=max_length, # Set to None for dynamic padding
-            padding=False, # Disable padding here, we will handle it later
+            max_length=max_length,
+            padding=True,
             truncation=True,
         )
 
-        # Setup the decoder input IDs (shifted right)
+        # Tokenize the target text
         labels = tokenizer(
             examples['target'],
-            max_length=max_length, # Set to None for dynamic padding
-            padding=False, # Disable padding here, we will handle it later
+            max_length=max_length,
+            padding=True,
             truncation=True,
-            text_target=examples['target'] # Use text_target for target text
         )
 
         # Add labels to the model inputs
         model_inputs["labels"] = labels["input_ids"]
         return model_inputs
-
-    tokenized_datasets = dataset.map(tokenize_function, batched=True, batch_size=1)
-    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
+
+    # Define a function to process the dataset in chunks
+    def tokenize_in_chunks(dataset, chunk_size=1000):
+        tokenized_datasets = []
+
+        for i in range(0, len(dataset), chunk_size):
+            chunk = dataset[i:i + chunk_size]
+            tokenized_chunk = chunk.map(tokenize_function, batched=True)
+            tokenized_datasets.append(tokenized_chunk)
+
+        # Concatenate all tokenized chunks into a single dataset
+        return tokenized_datasets
+
+    # Tokenize the dataset in chunks
+    tokenized_datasets = tokenize_in_chunks(dataset['train'], chunk_size=1000)
+
+    # If you want to combine all chunks into a single dataset
+    final_tokenized_dataset = concatenate_datasets(tokenized_datasets)
 
     # Set training arguments
     training_args = TrainingArguments(
@@ -94,8 +108,8 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch
     trainer = Trainer(
         model=model,
         args=training_args,
-        train_dataset=data_collator['train'],
-        eval_dataset=data_collator['test'],
+        train_dataset=final_tokenized_dataset['train'],
+        eval_dataset=final_tokenized_dataset['test'],
         #callbacks=[LoggingCallback()],
     )
 
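For orientation, the sketch below is a small, self-contained illustration of the chunked-tokenization idea this commit introduces. It is not the app's code: the checkpoint name, the column names, and the toy dataset are placeholders, and it uses Dataset.select to keep each chunk as a datasets.Dataset (plain slice indexing such as dataset[i:i + chunk_size] returns a dict of columns rather than a Dataset) before merging the chunks with concatenate_datasets.

from datasets import Dataset, concatenate_datasets
from transformers import AutoTokenizer

# Placeholder checkpoint and length; app.py receives the real values as parameters.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
max_length = 128

def tokenize_function(examples):
    # Mirrors the committed function: tokenize inputs and targets, attach labels.
    model_inputs = tokenizer(examples["text"], max_length=max_length,
                             padding=True, truncation=True)
    labels = tokenizer(examples["target"], max_length=max_length,
                       padding=True, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def tokenize_in_chunks(dataset, chunk_size=1000):
    # Dataset.select(...) keeps each chunk as a Dataset (slice indexing like
    # dataset[i:i + chunk_size] returns a plain dict of columns), so .map()
    # remains available on every chunk.
    chunks = []
    for start in range(0, len(dataset), chunk_size):
        indices = range(start, min(start + chunk_size, len(dataset)))
        chunks.append(dataset.select(indices).map(tokenize_function, batched=True))
    # Merge the per-chunk results back into one Dataset.
    return concatenate_datasets(chunks)

# Toy data with the 'text'/'target' columns the committed code expects.
toy = Dataset.from_dict({
    "text": [f"question {i}" for i in range(10)],
    "target": [f"answer {i}" for i in range(10)],
})
tokenized = tokenize_in_chunks(toy, chunk_size=4)
print(tokenized)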
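The Trainer in this commit indexes final_tokenized_dataset with 'train' and 'test' keys; a single concatenated Dataset does not carry those keys by itself, so the sketch below shows one way such splits could be produced with train_test_split, together with DataCollatorForSeq2Seq for dynamic batch padding. The checkpoint and the TrainingArguments values are placeholders rather than the app's actual settings, and `tokenized` and `tokenizer` are assumed from the previous sketch.

from transformers import (AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq,
                          Trainer, TrainingArguments)

# Placeholder checkpoint; app.py passes the real name in as model_name.
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

# Turn the single tokenized Dataset into the 'train'/'test' keys the Trainer indexes.
splits = tokenized.train_test_split(test_size=0.2, seed=42)

# Pads each batch on the fly and pads labels with -100 so they are ignored in the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="./results",              # placeholder; the app builds its own arguments
    num_train_epochs=1,
    per_device_train_batch_size=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
    data_collator=data_collator,
)
trainer.train()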