import gradio as gr
import os
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from huggingface_hub import login

# Retrieve the Hugging Face token from the Space secrets
token = os.getenv("HF_TOKEN")

# Log in using the token
login(token=token)

# Load the dataset
dataset = load_dataset('json', data_files='dataset.json')

# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", token=token)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token; reuse eos_token


def tokenize_function(examples):
    # Truncate to a fixed length; padding is applied per batch by the data collator
    return tokenizer(examples["text"], truncation=True, max_length=256)


# Drop the raw "text" column so the collator only sees tokenized features
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Data collator for causal language modeling: pads each batch and derives the
# labels from input_ids, masking padded positions so they are ignored in the loss
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Split dataset into training and validation
tokenized_datasets = tokenized_datasets["train"].train_test_split(test_size=0.1)
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]

# Fine-tune the model
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", token=token)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,   # Reduced batch size
    per_device_eval_batch_size=4,    # Reduced batch size
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none",                # Disables wandb logging
    fp16=torch.cuda.is_available(),  # Mixed precision only when a GPU is present
    gradient_accumulation_steps=8,   # Accumulate gradients over 8 steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

trainer.train()

# Save the model
model.save_pretrained("./fine-tuned-gpt2")
tokenizer.save_pretrained("./fine-tuned-gpt2")

# Evaluate the model
#results = trainer.evaluate()
#print(results)


# Create a Gradio interface for text generation
def generate_text(prompt):
    # Move inputs to the model's device (the Trainer may have placed it on GPU)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=50,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,  # Silences the missing-pad-token warning
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


iface = gr.Interface(fn=generate_text, inputs="text", outputs="text")
iface.launch()
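
# Optional: a minimal sketch (left commented out, since iface.launch() blocks above)
# of reloading the saved model for inference with the transformers pipeline API.
# The "./fine-tuned-gpt2" path matches the save_pretrained() calls earlier in this
# script; the prompt string is only an illustration.
#
# from transformers import pipeline
# generator = pipeline("text-generation", model="./fine-tuned-gpt2")
# print(generator("Once upon a time", max_new_tokens=30)[0]["generated_text"])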