# Hugging Face Space: keeperballon/multi-llm (status: Running)
# File size: 5,785 Bytes

import gradio as gr
from openai import OpenAI
import os
from datetime import datetime

# --- Application metadata (rendered in the page header) ---
APP_TITLE = "NO GPU, Multi LLMs Uses"
APP_DESCRIPTION = "Access and chat with multiple language models without requiring a GPU"

# --- Inference client ---
# The access token comes from the HF_TOKEN environment variable and is None
# when that variable is not set.
ACCESS_TOKEN = os.environ.get("HF_TOKEN")

# OpenAI client configured to talk to the Hugging Face inference endpoint.
client = OpenAI(
    api_key=ACCESS_TOKEN,
    base_url="https://api-inference.huggingface.co/v1/",
)

# Selectable models, grouped by family. Dict order (and the order inside each
# list) determines the order of ALL_MODELS below.
MODEL_CATEGORIES = {
    "Qwen": [
        "Qwen/Qwen2.5-72B-Instruct",
        "Qwen/Qwen2.5-3B-Instruct",
        "Qwen/Qwen2.5-0.5B-Instruct",
        "Qwen/Qwen2.5-Coder-32B-Instruct",
    ],
    "Meta LLaMa": [
        "meta-llama/Llama-3.3-70B-Instruct",
        "meta-llama/Llama-3.1-70B-Instruct",
        # NOTE(review): confirm this repository id exists on the Hub.
        "meta-llama/Llama-3.0-70B-Instruct",
        "meta-llama/Llama-3.2-3B-Instruct",
        "meta-llama/Llama-3.2-1B-Instruct",
        "meta-llama/Llama-3.1-8B-Instruct",
    ],
    "Mistral": [
        "mistralai/Mistral-Nemo-Instruct-2407",
        "mistralai/Mixtral-8x7B-Instruct-v0.1",
        "mistralai/Mistral-7B-Instruct-v0.3",
        "mistralai/Mistral-7B-Instruct-v0.2",
    ],
    "Microsoft Phi": [
        "microsoft/Phi-3.5-mini-instruct",
        "microsoft/Phi-3-mini-128k-instruct",
        "microsoft/Phi-3-mini-4k-instruct",
    ],
    "Other Models": [
        "NousResearch/Hermes-3-Llama-3.1-8B",
        "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "HuggingFaceH4/zephyr-7b-beta",
        "HuggingFaceTB/SmolLM2-360M-Instruct",
        "tiiuae/falcon-7b-instruct",
        "01-ai/Yi-1.5-34B-Chat",
    ],
}

# Every model id in one flat list, family by family.
ALL_MODELS = [
    model_id
    for family_models in MODEL_CATEGORIES.values()
    for model_id in family_models
]

def get_model_info(model_name):
    """Return a Markdown summary (organization, model, size) for a model id.

    Args:
        model_name: Model identifier, expected as ``"organization/model"``.
            ``None`` is treated like an empty string.

    Returns:
        A Markdown string. When the identifier does not consist of exactly
        two ``/``-separated parts, only the raw name is reported together
        with an unknown format. Otherwise the organization, the model name
        and the parameter-count tag found in the model name (for example
        ``72B``, ``0.5B``, ``8x7B`` or ``360M``) are reported; the size is
        ``Unknown`` when the name carries no such tag.
    """
    # Kept local because the file's top-level imports do not include ``re``.
    import re

    model_name = model_name or ""
    parts = model_name.split('/')
    if len(parts) != 2:
        return f"**Model:** {model_name}\n**Format:** Unknown"
    org, model = parts

    # Size tag: optional "<experts>x" prefix, a number, then B (billions) or
    # M (millions) in either case. The lookahead rejects tags that run into
    # a word (e.g. "4bit"), which would otherwise match now that the search
    # is case-insensitive.
    size_match = re.search(
        r'((?:\d+x)?\d+\.?\d*)([BM])(?![A-Za-z])',
        model,
        flags=re.IGNORECASE,
    )
    if size_match:
        # Normalise the unit so "7b" and "7B" are displayed the same way.
        size = size_match.group(1) + size_match.group(2).upper()
    else:
        size = "Unknown"
    return f"**Organization:** {org}\n**Model:** {model}\n**Size:** {size}"

def respond(
    message,
    history,
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    selected_model
):
    """Stream a chat completion and yield the growing chat history.

    Args:
        message: The new user message. A blank or ``None`` message sends
            nothing and yields the existing history unchanged.
        history: Previous turns as ``(user, assistant)`` pairs; ``None`` is
            treated as an empty history.
        system_message: Content of the leading system message.
        max_tokens: Forwarded to the completion request as ``max_tokens``.
        temperature: Forwarded to the completion request.
        top_p: Forwarded to the completion request.
        frequency_penalty: Forwarded to the completion request.
        seed: Forwarded to the completion request; ``-1`` is sent as
            ``None``.
        selected_model: Model id to query; falls back to ``ALL_MODELS[0]``
            when empty.

    Yields:
        A list of ``(user, assistant)`` pairs: the previous turns plus the
        current turn, whose assistant text grows as streamed chunks arrive.
        If the request raises, the assistant text of the current turn is
        replaced by ``"Error: <exception>"``.
    """
    past_turns = list(history or [])

    # Nothing to send: do not spend an API call on an empty prompt.
    if not message or not message.strip():
        yield past_turns
        return

    if seed == -1:
        seed = None

    # Rebuild the conversation in the role/content message format.
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in past_turns:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    model_to_use = selected_model or ALL_MODELS[0]

    new_history = past_turns + [(message, "")]
    # Emit the user's message right away so it is visible before the first
    # token arrives (and even if the stream produces no content at all).
    yield new_history

    current_response = ""
    try:
        stream = client.chat.completions.create(
            model=model_to_use,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            seed=seed,
            messages=messages,
        )
        for chunk in stream:
            # Skip chunks without choices instead of letting an IndexError
            # replace an otherwise good answer with an error message.
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta.content
            if delta:
                current_response += delta
                new_history[-1] = (message, current_response)
                yield new_history
    except Exception as e:
        # UI boundary: surface any failure in the chat window rather than
        # crashing the event handler.
        err = f"Error: {e}"
        new_history[-1] = (message, err)
        yield new_history

# UI layout: a settings column (model picker + generation parameters) next to
# a wider chat column. Both "Send" and pressing Enter stream into the chatbot.
with gr.Blocks(title=APP_TITLE, theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"## {APP_TITLE}\n\n{APP_DESCRIPTION}")

    with gr.Row():
        # ---- Left column: model choice and generation settings ----
        with gr.Column(scale=2):
            default_model = ALL_MODELS[0]
            selected_model = gr.Dropdown(
                label="Select Model",
                choices=ALL_MODELS,
                value=default_model,
            )
            model_info = gr.Markdown(get_model_info(default_model))

            def update_info(model_name):
                """Refresh the info panel for the newly selected model."""
                return get_model_info(model_name)

            # Keep the Markdown summary in sync with the dropdown.
            selected_model.change(
                fn=update_info,
                inputs=[selected_model],
                outputs=[model_info],
            )

            system_message = gr.Textbox(
                label="System Prompt",
                value="You are a helpful assistant.",
                lines=2,
            )

            max_tokens = gr.Slider(
                minimum=1, maximum=4096, value=512, label="Max New Tokens"
            )
            temperature = gr.Slider(
                minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"
            )
            top_p = gr.Slider(
                minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"
            )
            freq_penalty = gr.Slider(
                minimum=-2.0, maximum=2.0, value=0.0, step=0.1,
                label="Frequency Penalty",
            )
            seed = gr.Slider(
                minimum=-1, maximum=65535, value=-1, step=1,
                label="Seed (-1 random)",
            )

        # ---- Right column: conversation ----
        with gr.Column(scale=3):
            chatbot = gr.Chatbot()
            msg = gr.Textbox(show_label=False, placeholder="Type your message here...")
            send_btn = gr.Button("Send")

            # Order must match the positional parameters of respond().
            chat_inputs = [
                msg,
                chatbot,
                system_message,
                max_tokens,
                temperature,
                top_p,
                freq_penalty,
                seed,
                selected_model,
            ]

            # The button click and the textbox submit share identical wiring.
            for register_event in (send_btn.click, msg.submit):
                register_event(
                    fn=respond,
                    inputs=chat_inputs,
                    outputs=[chatbot],
                    queue=True,
                )

    # NOTE(review): launch() is called while still inside the Blocks context;
    # Gradio's examples usually call it after the `with` block — confirm.
    demo.launch()