nikunjcepatel committed
Commit 85e35cf · verified · 1 Parent(s): ba6c014

Create app.py

Files changed (1)
  1. app.py +80 -0
app.py ADDED
@@ -0,0 +1,80 @@
+ import gradio as gr
+ from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments
+ from datasets import load_dataset
+ from huggingface_hub import login
+ import os
+
+ # Retrieve the Hugging Face token from the Space secrets
+ token = os.getenv("HF_TOKEN")
+
+ # Log in using the token
+ login(token=token)
+
+ # Load the dataset (a local dataset.json with a "text" field per record)
+ dataset = load_dataset('json', data_files='dataset.json')
+
+ # Tokenize the dataset
+ tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", token=token)
+ tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token, so reuse eos_token
+
+ # Tokenize the data and ensure labels are set
+ def tokenize_function(examples):
+     # Tokenize input text, adding labels for causal language modeling
+     inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)
+
+     # For causal language modeling the labels are the input_ids; the model shifts them internally
+     inputs["labels"] = inputs["input_ids"].copy()
+     return inputs
+
+ tokenized_datasets = dataset.map(tokenize_function, batched=True)
+
+ # Data collator (padding-only; tokenize_function already sets the labels)
+ data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+ # Split dataset into training and validation
+ tokenized_datasets = tokenized_datasets['train'].train_test_split(test_size=0.1)
+ train_dataset = tokenized_datasets["train"]
+ eval_dataset = tokenized_datasets["test"]
+
+ # Fine-tune the model
+ model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", token=token)
+
+ training_args = TrainingArguments(
+     output_dir="./results",
+     eval_strategy="epoch",
+     learning_rate=2e-5,
+     per_device_train_batch_size=2,   # Reduced batch size
+     per_device_eval_batch_size=2,    # Reduced batch size
+     num_train_epochs=1,
+     weight_decay=0.01,
+     report_to="none",                # Disable wandb logging
+     fp16=True,                       # Mixed precision (16-bit); requires a GPU
+     gradient_accumulation_steps=4,   # Accumulate gradients over 4 steps
+ )
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=train_dataset,
+     eval_dataset=eval_dataset,
+     data_collator=data_collator,
+ )
+
+ trainer.train()
+
+ # Save the model and tokenizer
+ model.save_pretrained("./fine-tuned-llama")
+ tokenizer.save_pretrained("./fine-tuned-llama")
+
+ # Evaluate the model
+ # results = trainer.evaluate()
+ # print(results)
+
+ # Create a Gradio interface for text generation
+ def generate_text(prompt):
+     inputs = tokenizer(prompt, return_tensors="pt")
+     outputs = model.generate(**inputs)
+     return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+ iface = gr.Interface(fn=generate_text, inputs="text", outputs="text")
+ iface.launch()
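
Once the app is running, the launched Interface can also be called programmatically. A minimal sketch using gradio_client, assuming the default local URL (swap in the public Space URL once deployed); the prompt string is only an example, and /predict is the default endpoint name a single-function gr.Interface exposes:

from gradio_client import Client

# Assumed URL: the default local address used by iface.launch()
client = Client("http://127.0.0.1:7860")

# Call the generate_text endpoint with an example prompt
result = client.predict("Once upon a time", api_name="/predict")
print(result)

Note that generate_text closes over the in-memory model that was just fine-tuned, so responses reflect this training run rather than the base openai-community/gpt2 checkpoint.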