# Hugging Face Space: padmanabhbosamia/phi2-grpo (status: Running)
# File size: 12,224 Bytes

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from rich.console import Console
import time

# Rich console used for every colored status message printed by this script.
console = Console()

# Options shared by both checkpoint loads below, so the fine-tuned and the
# base model are loaded with exactly the same settings.
_MODEL_LOAD_OPTIONS = dict(
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16,  # half precision to keep memory usage down
    low_cpu_mem_usage=True,
)

# --- Fine-tuned checkpoint (local directory) and its tokenizer ---
console.print("[bold green]Loading model and tokenizer...[/bold green]")
model = AutoModelForCausalLM.from_pretrained(
    "./fine-tuned-model",
    **_MODEL_LOAD_OPTIONS,
)

tokenizer = AutoTokenizer.from_pretrained("./fine-tuned-model")
# Reuse the end-of-sequence token for padding and pad on the left.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'

# --- Original checkpoint, used for the before/after comparison ---
console.print("[bold green]Loading base model for comparison...[/bold green]")
base_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    **_MODEL_LOAD_OPTIONS,
)

def _generate_texts(lm, encoded, generation_kwargs):
    """Generate with ``lm`` from tokenized inputs and decode every sequence.

    Args:
        lm: Model exposing ``parameters()`` and ``generate()``.
        encoded: Mapping of tokenizer outputs (name -> tensor).
        generation_kwargs: Keyword arguments forwarded to ``lm.generate``.

    Returns:
        A list with one decoded string (special tokens skipped) per sequence
        returned by ``lm.generate``.
    """
    # Each model gets inputs on its own device: with device_map="auto" the
    # fine-tuned and the base model are not guaranteed to share one.
    target_device = next(lm.parameters()).device
    model_inputs = {
        name: tensor.to(target_device) for name, tensor in encoded.items()
    }
    with torch.no_grad():  # inference only, no gradients needed
        sequences = lm.generate(**model_inputs, **generation_kwargs)
    return [
        tokenizer.decode(sequence, skip_special_tokens=True)
        for sequence in sequences
    ]


def generate_response(
    prompt,
    max_length=128,  # Match training max_length
    temperature=0.7,
    top_p=0.9,
    num_generations=2,  # Match training num_generations
    repetition_penalty=1.1,
    do_sample=True,
    show_comparison=True,  # New parameter for comparison toggle
):
    """Generate text for ``prompt`` with the fine-tuned (and optionally base) model.

    Args:
        prompt: Text to continue.
        max_length: Maximum number of new tokens to generate
            (passed as ``max_new_tokens``).
        temperature: Sampling temperature; only used when ``do_sample`` is true.
        top_p: Nucleus-sampling threshold; only used when ``do_sample`` is true.
        num_generations: Number of sequences requested from the fine-tuned
            model. Forced to 1 when ``do_sample`` is false, because greedy
            decoding is deterministic and the library rejects
            ``num_return_sequences > 1`` in that mode.
        repetition_penalty: Repetition penalty forwarded to ``generate``.
        do_sample: Whether to sample instead of decoding greedily.
        show_comparison: When true, also generate one sequence with the base
            model and return a Markdown before/after report.

    Returns:
        A string: either the fine-tuned responses joined by a ``---``
        separator, or the Markdown comparison report. Failures are not
        raised; they are logged to the console and returned as
        ``"Error: <message>"``.
    """
    # An empty prompt tokenizes to nothing to generate from; fail early with
    # the same "Error: ..." shape the except branch below produces.
    if prompt is None or not str(prompt).strip():
        return "Error: Please enter a prompt."

    try:
        # UI widgets may hand over whole numbers as floats (assumption);
        # generate() expects integer counts.
        shared_kwargs = {
            "max_new_tokens": int(max_length),
            "do_sample": bool(do_sample),
            "repetition_penalty": repetition_penalty,
            "pad_token_id": tokenizer.eos_token_id,
            "eos_token_id": tokenizer.eos_token_id,
        }
        if do_sample:
            # Sampling-only knobs; passing them to greedy decoding is
            # meaningless and triggers library warnings.
            shared_kwargs["temperature"] = temperature
            shared_kwargs["top_p"] = top_p
            num_sequences = max(1, int(num_generations))
        else:
            # Greedy decoding cannot return several distinct sequences.
            num_sequences = 1

        # Tokenize once; the same encoding feeds both models.
        encoded = tokenizer(prompt, return_tensors="pt", padding=True)

        responses = _generate_texts(
            model,
            encoded,
            {**shared_kwargs, "num_return_sequences": num_sequences},
        )
        fine_tuned_response = "\n\n---\n\n".join(responses)

        if not show_comparison:
            return fine_tuned_response

        # Only one base-model sample is needed for the comparison.
        base_response = _generate_texts(
            base_model,
            encoded,
            {**shared_kwargs, "num_return_sequences": 1},
        )[0]

        return f"""

### Before Fine-tuning (Base Model)

{base_response}



### After Fine-tuning

{fine_tuned_response}

"""

    except Exception as e:
        # UI boundary: report the failure in the output box instead of
        # crashing the request.
        console.print(f"[bold red]Error during generation: {str(e)}[/bold red]")
        return f"Error: {str(e)}"

# Stylesheet handed to gr.Blocks(css=...) when the interface is built.
# The selectors correspond to class names used elsewhere in this file:
#   .title / .description        -> the Markdown header and example sections
#   .prompt-box / .output-box    -> the columns wrapping the prompt and output
#   .loading, .loading-spinner,
#   .loading-text, @keyframes spin -> the "Generating responses..." indicator
# NOTE(review): .container and .comparison are not referenced by any
# component in the visible part of this file -- confirm whether still needed.
custom_css = """

.gradio-container {

    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;

}

.container {

    max-width: 800px;

    margin: auto;

    padding: 20px;

}

.title {

    text-align: center;

    color: #2c3e50;

    margin-bottom: 20px;

}

.description {

    color: #34495e;

    line-height: 1.6;

    margin-bottom: 20px;

}

.comparison {

    background-color: #f8f9fa;

    padding: 15px;

    border-radius: 8px;

    margin: 10px 0;

}

.prompt-box {

    background-color: #ffffff;

    border: 2px solid #3498db;

    border-radius: 8px;

    padding: 15px;

    margin-bottom: 20px;

    box-shadow: 0 2px 4px rgba(0,0,0,0.1);

}

.prompt-box label {

    font-size: 1.1em;

    font-weight: bold;

    color: #2c3e50;

    margin-bottom: 10px;

    display: block;

}

.prompt-box textarea {

    width: 100%;

    min-height: 100px;

    padding: 10px;

    border: 1px solid #bdc3c7;

    border-radius: 4px;

    font-size: 1em;

    line-height: 1.5;

}

.output-box {

    background-color: #ffffff;

    border: 2px solid #2ecc71;

    border-radius: 8px;

    padding: 20px;

    margin-top: 20px;

    box-shadow: 0 2px 4px rgba(0,0,0,0.1);

}

.output-box label {

    font-size: 1.1em;

    font-weight: bold;

    color: #2c3e50;

    margin-bottom: 15px;

    display: block;

}

.output-box .markdown {

    background-color: #f8f9fa;

    padding: 15px;

    border-radius: 6px;

    border: 1px solid #e9ecef;

}

.output-box h3 {

    color: #2c3e50;

    border-bottom: 2px solid #3498db;

    padding-bottom: 8px;

    margin-top: 20px;

}

.output-box p {

    line-height: 1.6;

    color: #34495e;

    margin: 10px 0;

}

.loading {

    display: flex;

    align-items: center;

    justify-content: center;

    padding: 20px;

    background-color: #f8f9fa;

    border-radius: 8px;

    margin: 10px 0;

}

.loading-spinner {

    width: 40px;

    height: 40px;

    border: 4px solid #f3f3f3;

    border-top: 4px solid #3498db;

    border-radius: 50%;

    animation: spin 1s linear infinite;

    margin-right: 15px;

}

@keyframes spin {

    0% { transform: rotate(0deg); }

    100% { transform: rotate(360deg); }

}

.loading-text {

    color: #2c3e50;

    font-size: 1.1em;

    font-weight: 500;

}

"""

# Build the Gradio interface: a header, a two-column row (prompt and
# generation controls on the left, output and loading indicator on the
# right), a list of example prompts, and the event wiring for the button.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """

        # Phi-2 Fine-tuned with GRPO and qLoRA

        This model has been fine-tuned using GRPO (Generative Reward-Penalized Optimization) and compressed using qLoRA.

        Try it out with different prompts and generation parameters!

        """,
        elem_classes="title"
    )
    
    with gr.Row():
        # Left column: prompt and generation parameters.
        with gr.Column(scale=2):
            with gr.Column(elem_classes="prompt-box"):
                prompt = gr.Textbox(
                    label="Enter Your Prompt Here",
                    placeholder="Type your prompt here... (e.g., 'What is machine learning?' or 'Write a story about a robot learning to paint')",
                    lines=5,
                    show_label=True,
                )
            
            with gr.Row():
                with gr.Column():
                    max_length = gr.Slider(
                        minimum=32,
                        maximum=256,
                        value=128,
                        step=32,
                        label="Max Length",
                        info="Maximum number of tokens to generate"
                    )
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.7,
                        step=0.1,
                        label="Temperature",
                        info="Higher values make output more random, lower values more deterministic"
                    )
                with gr.Column():
                    top_p = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.9,
                        step=0.1,
                        label="Top-p",
                        info="Nucleus sampling parameter"
                    )
                    num_generations = gr.Slider(
                        minimum=1,
                        maximum=4,
                        value=2,
                        step=1,
                        label="Number of Generations",
                        info="Number of different responses to generate"
                    )
            
            with gr.Row():
                with gr.Column():
                    repetition_penalty = gr.Slider(
                        minimum=1.0,
                        maximum=2.0,
                        value=1.1,
                        step=0.1,
                        label="Repetition Penalty",
                        info="Higher values prevent repetition"
                    )
                with gr.Column():
                    do_sample = gr.Checkbox(
                        value=True,
                        label="Enable Sampling",
                        info="Enable/disable sampling for deterministic output"
                    )
            
            show_comparison = gr.Checkbox(
                value=True,
                label="Show Before/After Comparison",
                info="Toggle to show responses from both base and fine-tuned models"
            )
            
            generate_btn = gr.Button("Generate", variant="primary", size="large")
        
        # Right column: generated text plus the loading indicator.
        with gr.Column(scale=3):
            with gr.Column(elem_classes="output-box"):
                output = gr.Markdown(
                    label="Generated Response(s)",
                    show_label=True,
                    value="Your generated responses will appear here...",  # Add default value
                )
                loading_status = gr.Markdown(
                    value="",
                    show_label=False,
                    elem_classes="loading"
                )
    
    gr.Markdown(
        """

        ### Example Prompts

        Try these example prompts to test the model:

        

        1. **Technical Questions**:

           - "What is machine learning?"

           - "What is deep learning?"

           - "What is the difference between supervised and unsupervised learning?"

        

        2. **Creative Writing**:

           - "Write a short story about a robot learning to paint."

           - "Write a story about a time-traveling smartphone."

           - "Write a fairy tale about a computer learning to dream."

           - "Create a story about an AI becoming an artist."

        

        3. **Technical Explanations**:

           - "How does neural network training work?"

           - "Explain quantum computing in simple terms."

           - "What is transfer learning?"

        

        4. **Creative Tasks**:

           - "Write a poem about artificial intelligence."

           - "Write a poem about the future of technology."

           - "Create a story about a robot learning to dream."

        """,
        elem_classes="description"
    )
    
    def show_loading_status():
        """Return the HTML for the spinner shown while generation runs."""
        # Kept on a single logical line: in Markdown, a blank line ends an
        # HTML block and a following 4-space-indented line would be parsed
        # as a code block instead of markup.
        return (
            '<div class="loading">'
            '<div class="loading-spinner"></div>'
            '<div class="loading-text">Generating responses... Please wait...</div>'
            '</div>'
        )

    def generate_with_status(*args):
        """Run ``generate_response`` with the values of the input components.

        Args:
            *args: Positional values in the order of the ``inputs`` list
                wired to this handler below.

        Returns:
            The string produced by ``generate_response``.
        """
        # The loading indicator is driven by the surrounding event chain.
        # Assigning ``loading_status.value`` from inside a handler does not
        # change what the browser shows; a component is only updated when it
        # is listed in an event's ``outputs``.
        return generate_response(*args)

    def clear_loading_status():
        """Return an empty value so the loading indicator disappears."""
        return ""
    
    # Connect the interface as a three-step chain:
    #   1. show the loading indicator,
    #   2. run generation and fill the output box,
    #   3. clear the loading indicator again.
    generation_event = generate_btn.click(
        fn=show_loading_status,
        inputs=None,
        outputs=loading_status,
    )
    generation_event = generation_event.then(
        fn=generate_with_status,
        inputs=[
            prompt,
            max_length,
            temperature,
            top_p,
            num_generations,
            repetition_penalty,
            do_sample,
            show_comparison
        ],
        outputs=output
    )
    generation_event.then(
        fn=clear_loading_status,
        inputs=None,
        outputs=loading_status,
    )

if __name__ == "__main__":
    # Script entry point: announce start-up, then serve the interface.
    launch_options = {
        "server_name": "0.0.0.0",  # listen on all network interfaces
        "server_port": 7860,
        # Requests a public share link from Gradio.
        # NOTE(review): a share link may be unnecessary or unsupported when
        # the app is hosted on Hugging Face Spaces -- confirm.
        "share": True,
    }
    console.print("[bold green]Starting Gradio interface...[/bold green]")
    demo.launch(**launch_options)