# Hugging Face Space: TIGER-Lab/MAmmoTH2 (Running on Zero)
# File size: 3,704 Bytes

import os
from threading import Thread
from typing import Iterator

import spaces
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Bounds for the "max new tokens" control: the default value and the hard cap.
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_MAX_NEW_TOKENS = 2048
# Prompt budget in tokens; can be overridden through the environment.
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

# The checkpoint and its tokenizer are loaded only when CUDA is reported as
# available; otherwise `model` and `tokenizer` are left undefined.
if torch.cuda.is_available():
    model_id = "TIGER-Lab/MAmmoTH2-8B-Plus"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.float16,
    )

@spaces.GPU
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    system_prompt: str,
    max_new_tokens: int = 1024,
    temperature: float = 0.7,
    top_p: float = 1.0,
    repetition_penalty: float = 1.1,
    input_button: bool = False
) -> Iterator[str]:
    """Stream the model's reply to ``message``.

    Args:
        message: The latest user message.
        chat_history: Earlier ``(user, assistant)`` turns, oldest first.
        system_prompt: Optional system message; skipped when empty.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        repetition_penalty: Penalty applied to repeated tokens.
        input_button: Unused; accepted so that an extra UI value can be
            passed without breaking the call.

    Yields:
        The reply accumulated so far, growing with every decoded chunk.
    """
    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    for user, assistant in chat_history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})

    # add_generation_prompt makes the template end with the assistant-turn
    # header so the model answers instead of continuing the user turn.
    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        # Keep the most recent tokens; the oldest context is dropped.
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)

    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
    )

    # model.generate() blocks until generation is finished, so it has to run
    # on a worker thread while this generator drains the streamer. The
    # streamer is itself the iterator of decoded text chunks.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)

def _generate_from_interface(
    message: str,
    system_prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    repetition_penalty: float,
    input_button=None,
) -> Iterator[str]:
    """Adapt the values sent by ``chat_interface`` to ``generate``.

    ``gr.Interface`` passes one positional value per input component, in the
    order the components are listed. ``generate`` expects ``chat_history`` as
    its second parameter, which the UI has no component for, so handing it to
    the interface directly would shift every value into the wrong parameter.
    This wrapper supplies an empty history and forwards the rest by name.

    It is written as a generator function (``yield from``) so that Gradio
    still treats the endpoint as streaming.

    Args:
        message: Contents of the "User Input" textbox.
        system_prompt: Contents of the "System Prompt" textbox.
        max_new_tokens: Value of the "Max New Tokens" slider.
        temperature: Value of the "Temperature" slider.
        top_p: Value of the "Top-p (Nucleus Sampling)" slider.
        repetition_penalty: Value of the "Repetition Penalty" slider.
        input_button: Value sent for the button component; ignored.

    Yields:
        The reply accumulated so far, as produced by ``generate``.
    """
    yield from generate(
        message,
        [],  # the single-turn UI keeps no conversation history
        system_prompt,
        max_new_tokens=int(max_new_tokens),
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )


# Single-turn demo UI: two textboxes and four sampling controls feed the
# adapter above, and the streamed reply is shown in one output textbox.
chat_interface = gr.Interface(
    fn=_generate_from_interface,
    inputs=[
        gr.Textbox(label="User Input", lines=5, placeholder="Enter your message..."),
        gr.Textbox(label="System Prompt", lines=5, placeholder="Enter system prompt (optional)..."),
        gr.Slider(
            label="Max New Tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.01,
            maximum=1.0,
            step=0.01,
            value=0.7,
        ),
        gr.Slider(
            label="Top-p (Nucleus Sampling)",
            minimum=0.05,
            maximum=1.0,
            step=0.01,
            value=1.0,
        ),
        gr.Slider(
            label="Repetition Penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.1,
        ),
        gr.Button("Generate Response")
    ],
    outputs=gr.Textbox(label="Chat Output", lines=10),
    title="🦣MAmmoTH2",
    description="A simple web interactive chat demo based on gradio.",
    # Each example only fills the first input (the user message).
    examples=[
        ["Hello there! How are you doing?"],
        ["Can you explain briefly to me what is the Python programming language?"],
        ["Explain the plot of Cinderella in a sentence."],
        ["How many hours does it take a man to eat a helicopter?"],
        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
    ],
    theme="default",
    live=True,
)

chat_interface.launch()