import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import tiktoken
from datasets import load_dataset
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import os
# Define hyperparameters
vocab_size = 50257
n_heads = 8
n_layers = 6
head_size = 64
n_embd = 512
block_size = 128
dropout = 0.1
learning_rate = 3e-4
weight_decay = 0.1
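# Note: n_embd (512) = n_heads (8) * head_size (64), so each attention head operates on a 64-dimensional slice.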
# Set Hugging Face cache directories on the external disk
# Note: `datasets` reads these environment variables when it is imported, so ideally set them
# before the imports above; the explicit cache_dir passed to load_dataset below works either way.
os.environ['HF_HOME'] = '/media/adrian/FamilyBackup/adrian_ai_workspace/hf_cache'
os.environ['HF_DATASETS_CACHE'] = '/media/adrian/FamilyBackup/adrian_ai_workspace/datasets_cache'
# Load the BookCorpus dataset and ensure it's cached on the external disk
dataset = load_dataset("bookcorpus", cache_dir='/media/adrian/FamilyBackup/adrian_ai_workspace/')
# Keep only a small subset (0.1%) of the dataset to keep training manageable
total_samples = len(dataset["train"])
subset_samples = int(total_samples * 0.001)
dataset_subset = dataset["train"].select(range(subset_samples))  # Select only the first 0.1%
# Split the subset into train (90%) and test (10%)
split_dataset = dataset_subset.train_test_split(test_size=0.1) # 10% for testing
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]
# Print the size of the train and the test sets
print(f"Train size: {len(train_dataset)}")
print(f"Test size: {len(test_dataset)}")
# Initialize the tiktoken encoder
enc = tiktoken.get_encoding("gpt2")
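# Quick sanity check (optional sketch): tiktoken's GPT-2 encoding maps text to token ids and back.
sample_ids = enc.encode("Hello world")
print(f"'Hello world' -> {sample_ids} -> '{enc.decode(sample_ids)}'")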
# Define the tokenization function
def tokenize_function(examples):
    # Encode each text once, then derive the attention mask from the token ids
    encoded = [enc.encode(text) for text in examples["text"]]
    return {
        "input_ids": encoded,
        "attention_mask": [[1] * len(ids) for ids in encoded],
    }
# Function to pad or truncate sequences to a fixed length
def pad_or_truncate(batch):
    max_length = 512  # Longer than block_size (128); the model truncates inputs to block_size in forward()
for key in ['input_ids', 'attention_mask']:
batch[key] = [
seq[:max_length] + [0] * (max_length - len(seq)) if len(seq) < max_length else seq[:max_length]
for seq in batch[key]
]
return batch
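# Example (sketch) of the padding behaviour: a 3-token sequence is right-padded with zeros up to max_length,
# e.g. pad_or_truncate({"input_ids": [[1, 2, 3]], "attention_mask": [[1, 1, 1]]})["input_ids"][0][:5] == [1, 2, 3, 0, 0]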
# Tokenize and process the datasets
def process_dataset(dataset, split_name):
# Tokenize
tokenized_dataset = dataset.map(
tokenize_function,
batched=True,
num_proc=20,
remove_columns=dataset.column_names
)
# Pad or truncate
processed_dataset = tokenized_dataset.map(
pad_or_truncate,
batched=True,
num_proc=20,
)
# Set format to PyTorch tensors
processed_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
return processed_dataset
# Process both train and test datasets
train_dataset = process_dataset(train_dataset, "train")
test_dataset = process_dataset(test_dataset, "test")
# Print some examples
print(f"Example train data: {train_dataset[0]}")
print(f"Example test data: {test_dataset[0]}")
# Create DataLoaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8, shuffle=False)
# Print an example batch
for batch in train_loader:
print(f"Batch input ids shape: {batch['input_ids'].shape}")
print(f"Batch attention mask shape: {batch['attention_mask'].shape}")
break
# Define model
class Head(nn.Module):
""" One head of self-attention """
def __init__(self, head_size, n_embd, block_size, dropout):
super().__init__()
self.key = nn.Linear(n_embd, head_size, bias=False)
self.query = nn.Linear(n_embd, head_size, bias=False)
self.value = nn.Linear(n_embd, head_size, bias=False)
self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        B, T, C = x.shape
        assert C == self.key.in_features, f"Input size {C} doesn't match expected size {self.key.in_features}"
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        v = self.value(x)  # (B, T, head_size)
        # Scaled dot-product attention scores
        wei = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5  # (B, T, T)
        # Causal mask: each position may only attend to itself and earlier positions
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        # Weighted aggregation of the values
        out = wei @ v  # (B, T, head_size)
        return out
class MultiHeadAttention(nn.Module):
""" Multiple heads of self-attention in parallel """
def __init__(self, n_heads, head_size, n_embd, dropout):
super().__init__()
self.heads = nn.ModuleList([Head(head_size, n_embd, block_size, dropout) for _ in range(n_heads)])
self.proj = nn.Linear(n_heads * head_size, n_embd)
self.dropout = nn.Dropout(dropout)
def forward(self, x):
# Collects the outputs from each head
head_outputs = [head(x) for head in self.heads]
# Concatenate the outputs
concatenated = torch.cat(head_outputs, dim=-1)
# Apply linear transformation and dropout
out = self.proj(concatenated)
out = self.dropout(out)
return out
class FeedForward(nn.Module):
""" A simple linear layer followed by non-linearity """
def __init__(self, n_embd, dropout=0.1, expansion_factor=4):
super().__init__()
self.net = nn.Sequential(
nn.Linear(n_embd, expansion_factor * n_embd),
nn.ReLU(),
nn.Linear(expansion_factor * n_embd, n_embd),
nn.Dropout(dropout),
)
def forward(self, x):
return self.net(x)
class Block(nn.Module):
""" Transformer block: communication followed by computation """
def __init__(self, n_embd, n_head, dropout=0.1):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
super().__init__()
head_size = n_embd // n_head
self.sa = MultiHeadAttention(n_head, head_size, n_embd, dropout)
self.ffwd = FeedForward(n_embd, dropout)
self.ln1 = nn.LayerNorm(n_embd)
self.ln2 = nn.LayerNorm(n_embd)
def forward(self, x):
x = x + self.sa(self.ln1(x))
x = x + self.ffwd(self.ln2(x))
return x
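# Quick shape sanity check (optional sketch): a Block maps (B, T, n_embd) back to (B, T, n_embd),
# using the hyperparameters defined at the top of the file.
_blk = Block(n_embd, n_heads)
print(f"Block output shape: {_blk(torch.randn(2, block_size, n_embd)).shape}")  # expect (2, 128, 512)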
class GPTLanguageModel(nn.Module):
def __init__(self, vocab_size, n_embd, block_size, n_layer, n_head, device="cpu"):
super().__init__()
self.device = device
self.block_size = block_size
self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
self.position_embedding_table = nn.Embedding(block_size, n_embd)
self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
self.ln_f = nn.LayerNorm(n_embd)
self.lm_head = nn.Linear(n_embd, vocab_size)
self.apply(self._init_weights)
def _init_weights(self, module):
if isinstance(module, nn.Linear):
nn.init.normal_(module.weight, mean=0.0, std=0.02)
if module.bias is not None:
nn.init.zeros_(module.bias)
elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)  # Zero-mean init, matching the Linear layers above
def forward(self, idx, targets=None):
B, T = idx.shape
# Truncate sequence length to block_size
T = min(T, self.block_size)
idx = idx[:, :T]
# Get token embeddings for input indices
tok_emb = self.token_embedding_table(idx) # (B, T, C)
# Get position embeddings (truncate to match input length)
pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T, C)
# Combine token and position embeddings
x = tok_emb + pos_emb.unsqueeze(0) # (B, T, C)
# Apply transformer blocks
x = self.blocks(x) # (B, T, C)
# Final layer normalization
x = self.ln_f(x) # (B, T, C)
# Get logits for vocabulary prediction
logits = self.lm_head(x) # (B, T, vocab_size)
# Optionally calculate loss if targets are provided
loss = None
if targets is not None:
# Ensure targets are the same size as logits
targets = targets[:, :T]
B, T, C = logits.shape
logits = logits.reshape(B*T, C)
targets = targets.reshape(B*T)
loss = F.cross_entropy(logits, targets)
return logits, loss
@torch.no_grad()
def generate(self, idx, max_new_tokens):
for _ in range(max_new_tokens):
idx_cond = idx[:, -self.block_size:] # Crop to the last block_size tokens
            logits, _ = self(idx_cond)  # Get predictions
            logits = logits[:, -1, :]  # Focus on the last time step
            probs = F.softmax(logits, dim=-1)  # Get probabilities
            idx_next = torch.multinomial(probs, num_samples=1)  # Sample from the distribution
idx = torch.cat((idx, idx_next), dim=1) # Append sampled index
return idx
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print (f"Using device: {device}")
# Instantiate the model
model = GPTLanguageModel(vocab_size, n_embd, block_size, n_layers, n_heads, device=device)
# Move the model to the GPU (if available)
model = model.to(device)
# Define criterion and optimizer
# Note: the model computes its own cross-entropy loss in forward(), so `criterion` is unused;
# AdamW applies the weight_decay declared with the other hyperparameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
# Training loop with progress reporting
def batch_gh(model, criterion, optimizer, train_loader, test_loader, epochs):
train_losses = np.zeros(epochs)
test_losses = np.zeros(epochs)
for it in range(epochs):
model.train() # Set model to training mode
t0 = datetime.now()
train_loss = []
for i, batch in enumerate(train_loader):
inputs = batch["input_ids"].to(device)
attention_mask = batch["attention_mask"].to(device)
# Create targets by shifting inputs by one position
targets = inputs[:, 1:].contiguous()
inputs = inputs[:, :-1].contiguous()
# Zero parameter gradients
optimizer.zero_grad()
# Forward pass
outputs, loss = model(inputs, targets)
# Backward and optimize
loss.backward()
optimizer.step()
train_loss.append(loss.item())
# Print progress every 100 batches
if (i + 1) % 100 == 0:
print(f'Epoch {it + 1}/{epochs}, Batch {i + 1}/{len(train_loader)}, Loss: {loss.item():.4f}')
# Get average train_loss
train_loss = np.mean(train_loss)
model.eval() # Set model to evaluation mode
test_loss = []
with torch.no_grad():
for batch in test_loader:
inputs = batch["input_ids"].to(device)
attention_mask = batch["attention_mask"].to(device)
# Create targets by shifting inputs by one position
targets = inputs[:, 1:].contiguous()
inputs = inputs[:, :-1].contiguous()
outputs, loss = model(inputs, targets)
test_loss.append(loss.item())
test_loss = np.mean(test_loss)
# Save losses
train_losses[it] = train_loss
test_losses[it] = test_loss
dt = datetime.now() - t0
print(f'Epoch {it + 1}/{epochs}, Train Loss: {train_loss:.4f}, '
f'Test Loss: {test_loss:.4f}, Duration: {dt}')
return train_losses, test_losses
# Run the training
train_losses, test_losses = batch_gh(model, criterion, optimizer, train_loader, test_loader, epochs=2)
# Plot loss
plt.plot(train_losses, label="train_loss")
plt.plot(test_losses, label="test_loss")
plt.legend()
plt.show()
# Save model weights
model_save_path = "/home/adrian/Documents/StoryCrafterLLM/model_weights.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")