File size: 6,851 Bytes
b4b3dd2
 
 
 
ab1497e
cc775c6
b4b3dd2
 
cc775c6
b4b3dd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9dfc456
 
 
 
b4b3dd2
 
9dfc456
b4b3dd2
 
 
 
 
 
 
 
 
 
 
 
 
09ef223
b4b3dd2
 
cc775c6
 
 
9dfc456
87d150b
9dfc456
 
 
 
 
 
 
 
 
 
 
 
cc775c6
9dfc456
cc775c6
 
 
 
 
 
 
 
 
 
 
ab1497e
 
 
cc775c6
 
 
9dfc456
 
cc775c6
 
 
 
 
 
 
 
ab1497e
928b596
cc775c6
928b596
 
 
 
 
 
 
 
cc775c6
928b596
 
 
 
ab1497e
 
 
 
b4b3dd2
 
928b596
b4b3dd2
 
 
 
 
ba1722a
b4b3dd2
 
 
 
 
 
 
 
 
 
 
 
 
 
928b596
b4b3dd2
 
 
ab1497e
928b596
ab1497e
 
 
 
b4b3dd2
928b596
ab1497e
b4b3dd2
 
 
928b596
b4b3dd2
928b596
ab1497e
928b596
ab1497e
928b596
 
 
 
 
 
b4b3dd2
 
 
 
 
cc775c6
9dfc456
 
 
 
 
 
cc775c6
928b596
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import gradio as gr
import os
import json
import torch
import subprocess
import sys
from dotenv import load_dotenv
import logging
import threading

# Configure logging: mirror everything to stdout (container logs) and to a
# local app.log file so the run can be inspected after the Space restarts.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("app.log")
    ]
)
logger = logging.getLogger(__name__)

# Load environment variables from a .env file, if present (e.g. HF_SPACE_NAME).
load_dotenv()

# Get script directory - important for Hugging Face Space paths, where the
# working directory is not guaranteed to be the directory containing this file.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# BASE_DIR resolves to SCRIPT_DIR; kept as a separate name so a subdirectory
# could be substituted without touching the rest of the module.
BASE_DIR = os.path.abspath(os.path.join(SCRIPT_DIR, "."))

# Load config file
def load_config(config_path="transformers_config.json"):
    """Load the JSON training configuration.

    Args:
        config_path: Filename (or relative path) of the config file,
            resolved against ``BASE_DIR``.

    Returns:
        dict: Parsed configuration, or an empty dict when the file is
        missing, unreadable, or contains invalid JSON. The error is
        logged so the dashboard can still start with defaults.
    """
    config_path = os.path.join(BASE_DIR, config_path)
    try:
        # Explicit encoding avoids platform-dependent default codecs.
        with open(config_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, json.JSONDecodeError) as e:
        # Narrowed from `except Exception`: only expected I/O and parse
        # failures are swallowed; programming errors now surface.
        logger.error(f"Error loading config: {str(e)}")
        return {}

# Load configuration at import time; a failure yields {} and the defaults below.
config = load_config()
model_config = config.get("model_config", {})

# Model details from config. MODEL_NAME falls back to the hard-coded checkpoint
# when the config file omits it; SPACE_NAME comes from the environment
# (HF_SPACE_NAME) and is only used as the dashboard title.
MODEL_NAME = model_config.get("model_name_or_path", "unsloth/DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit")
SPACE_NAME = os.getenv("HF_SPACE_NAME", "phi4training")

# Function to run training in a thread and stream output to container logs
def run_training():
    """Run the training script as a subprocess and stream its output.

    Locates ``run_cloud_training.py`` under ``BASE_DIR``, launches it with
    the current interpreter, and forwards every line of its combined
    stdout/stderr to this process's stdout so it appears in the container
    logs. Returns early (after listing BASE_DIR contents for debugging)
    if the script is missing.
    """
    # Locate training script using absolute path
    training_script = os.path.join(BASE_DIR, "run_cloud_training.py")

    # Fail fast with diagnostics if the script is not where we expect.
    if not os.path.exists(training_script):
        print(f"ERROR: Training script not found at: {training_script}")
        print(f"Current directory: {os.getcwd()}")
        print("Available files:")
        for fname in os.listdir(BASE_DIR):
            print(f" - {fname}")
        return

    print(f"Found training script at: {training_script}")

    process = subprocess.Popen(
        [sys.executable, training_script],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # merge stderr so no output is lost
        universal_newlines=True,
        bufsize=1,  # line-buffered for timely log streaming
    )

    try:
        # Stream output directly to sys.stdout (container logs)
        for line in iter(process.stdout.readline, ''):
            sys.stdout.write(line)
            sys.stdout.flush()
    finally:
        # Bug fix: the original leaked the pipe and never reaped the child,
        # leaving a zombie process. Close the pipe, wait for the child, and
        # surface a non-zero exit status in the logs.
        process.stdout.close()
        returncode = process.wait()
        if returncode != 0:
            print(f"Training process exited with non-zero status: {returncode}")

# Function to start the training process
def start_training():
    """Kick off the fine-tuning run on a background daemon thread.

    Prints a startup banner to the container logs, spawns ``run_training``
    in a daemon thread so this Gradio handler returns immediately, and
    hands back a markdown status message for the dashboard. Any failure
    to launch is logged and reported as an error string instead.
    """
    try:
        # Emit the startup banner directly to the container logs.
        banner = [
            "\n===== STARTING TRAINING PROCESS =====\n",
            f"Model: {MODEL_NAME}",
            f"Base directory: {BASE_DIR}",
            f"Current working directory: {os.getcwd()}",
            f"Training with configuration from transformers_config.json",
            "Training logs will appear below:",
            "=" * 50,
        ]
        for banner_line in banner:
            print(banner_line)

        # Daemon thread: must not keep the app alive when it shuts down.
        worker = threading.Thread(target=run_training, daemon=True)
        worker.start()

        logger.info("Training started in background thread")

        return """
        ✅ Training process initiated! 
        
        The model is now being fine-tuned in the background.
        
        To monitor progress:
        1. Check the Hugging Face space logs in the "Logs" tab
        2. You should see training output appearing directly in the logs
        3. The process will continue running in the background
        
        NOTE: This is a research training phase only, no model outputs will be available.
        """
    except Exception as e:
        logger.error(f"Error starting training: {str(e)}")
        return f"❌ Error starting training: {str(e)}"

# Create Gradio interface - training status only, no model outputs.
# The CSS rule hides Gradio's default footer.
with gr.Blocks(css="footer {visibility: hidden}") as demo:
    gr.Markdown(f"# {SPACE_NAME}: Research Training Dashboard")
    
    with gr.Row():
        with gr.Column():
            # Static dashboard header; values are interpolated once at build
            # time from the loaded config (with the same defaults used by
            # the training side).
            status = gr.Markdown(
                f"""
                ## DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit Training Dashboard
                
                **Model**: {MODEL_NAME}
                **Dataset**: phi4-cognitive-dataset
                
                This is a multidisciplinary research training phase. The model is not available for interactive use.
                
                ### Training Configuration:
                - **Epochs**: {config.get("training_config", {}).get("num_train_epochs", 3)}
                - **Batch Size**: {config.get("training_config", {}).get("per_device_train_batch_size", 2)}
                - **Gradient Accumulation Steps**: {config.get("training_config", {}).get("gradient_accumulation_steps", 4)}
                - **Learning Rate**: {config.get("training_config", {}).get("learning_rate", 2e-5)}
                - **Max Sequence Length**: {config.get("training_config", {}).get("max_seq_length", 2048)}
                
                ⚠️ **NOTE**: This space does not provide model outputs during the research training phase.
                All logs are available in the Hugging Face "Logs" tab.
                """
            )
    
    with gr.Row():
        # Add button for starting training
        start_btn = gr.Button("Start Training", variant="primary")
        
    # Output area for training start messages
    training_output = gr.Markdown("")
    
    # Connect start button to function: start_training returns the markdown
    # string rendered into training_output.
    start_btn.click(start_training, outputs=training_output)
    
    gr.Markdown("""
    ### Research Training Information
    
    This model is being fine-tuned on research-focused datasets and is not available for interactive querying.
    The training process will run in the background and logs will be available in the Hugging Face UI.
    
    #### Instructions
    1. Click "Start Training" to begin the fine-tuning process
    2. Monitor progress in the Hugging Face "Logs" tab
    3. Training metrics and results will be saved to the output directory
    
    #### About This Project
    The model is being fine-tuned on the phi4-cognitive-dataset with a focus on research capabilities.
    This training phase does not include any interactive features or output generation.
    """)

# Launch the interface
if __name__ == "__main__":
    # Start Gradio with minimal features. Print a startup banner plus the
    # directory listing so misconfigured Space layouts are easy to diagnose
    # from the container logs.
    print("\n===== RESEARCH TRAINING DASHBOARD STARTED =====\n")
    print(f"Base directory: {BASE_DIR}")
    print(f"Current working directory: {os.getcwd()}")
    print("Available files:")
    for file in os.listdir(BASE_DIR):
        print(f" - {file}")
    print("\nClick 'Start Training' to begin the fine-tuning process")
    print("All training output will appear in these logs")
    logger.info("Starting research training dashboard")
    # share=False: the Space itself provides the public URL.
    demo.launch(share=False)