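# Gradio demo for the WillHeld/soft-raccoon checkpoint: responses are generated
# in a background thread and streamed to the chat UI token by token.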
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import gradio as gr
from threading import Thread

checkpoint = "WillHeld/soft-raccoon"
device = "cuda"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

SYSTEM_PROMPT = """\
You are the Tootsie 8B advanced language model trained using Marin, a framework developed by Stanford's Center for Research on Foundation Models (CRFM).

Marin is a framework designed for training large language models in an entirely open fashion with a focus on legibility, scalability, and reproducibility.

CRFM (Center for Research on Foundation Models) is a research center at Stanford University dedicated to studying foundation models - large-scale AI systems trained on broad data that can be adapted to a wide range of downstream tasks.

Your training using this framework emphasizes clear reasoning, consistent outputs, and scalable performance across various tasks. Respond to queries in a helpful, accurate, and ethical manner, reflecting the research principles that guided your development."""


@spaces.GPU(duration=120)
def predict(message, history, temperature, top_p):
    # Seed a fresh conversation with the system prompt
    if len(history) == 0:
        history.append({"role": "system", "content": SYSTEM_PROMPT})
    history.append({"role": "user", "content": message})
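    # Render the conversation with the model's chat template, ending with the
    # assistant generation prompt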
    input_text = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
    # The chat template already inserts special tokens, so skip adding a second BOS
    inputs = tokenizer.encode(input_text, return_tensors="pt", add_special_tokens=False).to(device)
    
    # Create a streamer
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    
    # Set up generation parameters
    generation_kwargs = {
        "input_ids": inputs,
        "max_new_tokens": 1024,
        "temperature": float(temperature),
        "top_p": float(top_p),
        "do_sample": True,
        "streamer": streamer,
        "eos_token_id": 128009,
    }
    
    # Run generation in a separate thread
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    
    # Yield from the streamer as tokens are generated
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text
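    # Ensure the generation thread has finished before the handler returns
    thread.join()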

with gr.Blocks() as demo:
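    # type="messages" keeps history as a list of {"role": ..., "content": ...}
    # dicts, matching the format predict() appends to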
    chatbot = gr.ChatInterface(
        predict,
        additional_inputs=[
            gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"),
            gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P")
        ],
        type="messages"
    )

demo.launch()