# Source export metadata: 5,927 bytes, revision ab3efd5.
import math
import os
import threading

import boto3
from boto3.s3.transfer import TransferConfig
from tqdm import tqdm
def upload_file_to_s3(file_path, bucket_name, s3_prefix):
    """Upload a local file to S3 with a multipart transfer and a tqdm progress bar.

    Args:
        file_path (str): Path of the local file to upload.
        bucket_name (str): Destination S3 bucket name.
        s3_prefix (str): Key prefix under which the file is stored; the final
            key is "<prefix>/<basename>".

    Returns:
        str | None: The "s3://bucket/key" URI on success, None on any failure
        (the error is printed, not raised — best-effort behavior preserved).
    """

    class ProgressPercentage(object):
        """Transfer callback that advances a tqdm bar as bytes are sent."""

        def __init__(self, filename):
            self._filename = filename
            self._size = float(os.path.getsize(filename))
            self._seen_so_far = 0
            # boto3 invokes the callback from multiple worker threads when
            # use_threads=True, so shared counters must be lock-protected.
            self._lock = threading.Lock()
            self._pbar = tqdm(
                total=self._size,
                unit='B',
                unit_scale=True,
                desc=f"Uploading {os.path.basename(filename)}",
            )

        def __call__(self, bytes_amount):
            with self._lock:
                self._seen_so_far += bytes_amount
                self._pbar.update(bytes_amount)

        def close(self):
            # Fix: the original never closed the bar, leaving a dangling
            # tqdm instance that corrupts subsequent terminal output.
            self._pbar.close()

    s3_client = boto3.client('s3')
    file_name = os.path.basename(file_path)
    # rstrip avoids a double slash in the key when the prefix ends with "/".
    s3_path = f"{s3_prefix.rstrip('/')}/{file_name}"

    # Configure multipart upload. Fix: the original used 1024 * 25 (25 KB)
    # while its comments claimed 25 MB; 25 MB is the intended value (S3's
    # minimum multipart part size is 5 MiB, so tiny chunks are never useful).
    config = TransferConfig(
        multipart_threshold=25 * 1024 * 1024,  # 25MB
        max_concurrency=10,
        multipart_chunksize=25 * 1024 * 1024,  # 25MB
        use_threads=True,
    )

    progress = ProgressPercentage(file_path)
    try:
        s3_client.upload_file(
            file_path,
            bucket_name,
            s3_path,
            Config=config,
            Callback=progress,
        )
        return f"s3://{bucket_name}/{s3_path}"
    except Exception as e:
        print(f"Failed to upload {file_path} to S3: {str(e)}")
        return None
    finally:
        # Always release the progress bar, success or failure.
        progress.close()
# Learning-rate schedule hyperparameters (consumed by get_lr_lambda and
# plot_lr_schedule below).
max_lr = 1e-3  # peak learning rate, reached at the end of warmup
warmup_steps = 10  # number of steps of linear warmup
max_steps = 25000  # step at which cosine decay bottoms out at the minimum LR
import math
def get_lr_lambda(current_step, warmup_steps, max_steps, max_lr):
    """Return the learning rate for ``current_step``.

    The schedule has three phases:
      1. Linear warmup from max_lr/warmup_steps up to max_lr.
      2. Cosine decay from max_lr down to 10% of max_lr at max_steps.
      3. A constant floor of 10% of max_lr beyond max_steps.
    """
    floor = max_lr * 0.1  # the schedule never drops below 10% of the peak

    # Phase 1: linear ramp; step 0 already gets a non-zero rate.
    if current_step < warmup_steps:
        return max_lr * (current_step + 1) / warmup_steps

    # Phase 3: past the horizon, hold the floor.
    if current_step > max_steps:
        return floor

    # Phase 2: cosine interpolation between the peak and the floor.
    progress = (current_step - warmup_steps) / (max_steps - warmup_steps)
    cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))
    return floor + cosine_weight * (max_lr - floor)
def plot_lr_schedule():
    """Visualize the LR schedule defined by the module-level hyperparameters.

    Evaluates get_lr_lambda over [0, max_steps + 100) so the post-schedule
    floor is visible, then displays the curve with matplotlib.
    """
    import matplotlib.pyplot as plt

    xs = list(range(0, max_steps + 100))
    ys = [get_lr_lambda(s, warmup_steps, max_steps, max_lr) for s in xs]

    plt.figure(figsize=(10, 5))
    plt.plot(xs, ys)
    plt.title('Learning Rate Schedule')
    plt.xlabel('Steps')
    plt.ylabel('Learning Rate')
    plt.grid(True)
    plt.show()
def _parse_loss_log(log_file_path):
    """Extract parallel (steps, losses) lists from a training log file.

    Only lines matching "Batch <n>, Running Avg Loss: <x>" contribute;
    all other lines are ignored.
    """
    import re

    # Compile once instead of re-searching the raw pattern per line;
    # `log` also avoids shadowing the `file` builtin as the original did.
    pattern = re.compile(r"Batch (\d+), Running Avg Loss: ([0-9.]+)")
    steps = []
    losses = []
    with open(log_file_path, 'r') as log:
        for line in log:
            match = pattern.search(line)
            if match:
                steps.append(int(match.group(1)))
                losses.append(float(match.group(2)))
    return steps, losses


def _add_trend_line(steps, losses):
    """Fit and draw a quadratic trend line on the current axes.

    Falls back to a moving-average trend if the curve fit fails.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from scipy.optimize import curve_fit

    def poly_func(x, a, b, c):
        return a * x**2 + b * x + c

    x_array = np.array(steps)
    y_array = np.array(losses)
    try:
        popt, _ = curve_fit(poly_func, x_array, y_array)
        x_line = np.linspace(min(steps), max(steps), 1000)
        plt.plot(x_line, poly_func(x_line, *popt), 'r-', label='Trend Line')
    except Exception as e:
        print(f"Could not fit trend line: {e}")
        # Fallback: simple moving average; window scales with data size.
        window_size = min(len(steps) // 10, 100) if len(steps) > 100 else len(steps) // 2
        if window_size > 0:
            moving_avg = np.convolve(y_array, np.ones(window_size) / window_size, mode='valid')
            # 'valid' convolution yields len - window_size + 1 points, which
            # lines up with steps[window_size - 1:].
            plt.plot(steps[window_size - 1:], moving_avg, 'r-', label='Moving Average Trend')


def _annotate_extremes(steps, losses):
    """Annotate the minimum and maximum loss points on the current axes."""
    import matplotlib.pyplot as plt

    min_loss = min(losses)
    min_idx = losses.index(min_loss)
    max_loss = max(losses)
    max_idx = losses.index(max_loss)
    plt.annotate(f'Min: {min_loss:.5f}',
                 xy=(steps[min_idx], min_loss),
                 xytext=(steps[min_idx], min_loss * 1.05),
                 arrowprops=dict(facecolor='green', shrink=0.05),
                 fontsize=10)
    plt.annotate(f'Max: {max_loss:.5f}',
                 xy=(steps[max_idx], max_loss),
                 xytext=(steps[max_idx], max_loss * 0.95),
                 arrowprops=dict(facecolor='red', shrink=0.05),
                 fontsize=10)


def plot_training_loss(log_file_path, output_path=None):
    """Parse a training log file and plot the running average loss against batch steps.

    A quadratic trend line (with a moving-average fallback) is overlaid, and
    the minimum/maximum loss points are annotated.

    Args:
        log_file_path (str): Path to the training log file.
        output_path (str, optional): Path to save the plot as PNG. If None,
            displays the plot instead.
    """
    import matplotlib.pyplot as plt

    steps, losses = _parse_loss_log(log_file_path)
    if not steps:
        print("No loss data found in the log file.")
        return

    plt.figure(figsize=(12, 6))
    plt.plot(steps, losses, 'b-', alpha=0.5, label='Running Avg Loss')
    _add_trend_line(steps, losses)

    plt.xlabel('Batch Number')
    plt.ylabel('Running Average Loss')
    plt.title('Training Loss Over Time')
    plt.grid(True)
    plt.legend()
    _annotate_extremes(steps, losses)

    plt.tight_layout()
    if output_path:
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        print(f"Plot saved to {output_path}")
        # Fix: release the figure so repeated batch calls don't accumulate
        # open figures (the original leaked one per saved plot).
        plt.close()
    else:
        plt.show()
# Script entry point: render the training-loss plot from the default log.
if __name__ == "__main__":
    # plot_lr_schedule()  # enable to preview the LR schedule instead
    plot_training_loss("training.log", "train_loss.png")