bitnet-with-ik_llama / chat_demo.py
lemonteaa's picture
Update chat_demo.py
a7e66dd verified
import gradio as gr
from openai import OpenAI
import uuid
import json
import os
import tempfile
import subprocess
import threading
# Base URL handed to the OpenAI client created further down in this file.
# NOTE(review): presumably the address the local llama-server listens on —
# confirm the port matches the server's configuration.
BASE_URL = "http://localhost:8080/v1"
# Value sent as the `model` field of every chat-completion request.
MODEL_NAME = "bn"
def read_output(process):
    """Echo a subprocess's stdout to this console, one line at a time.

    Lines are pulled from ``process.stdout`` until ``readline`` returns an
    empty string; each is printed with trailing whitespace stripped. The
    stream is closed once it is exhausted.
    """
    stream = process.stdout
    while True:
        line = stream.readline()
        if line == "":
            # An empty read marks end-of-stream.
            break
        print(line.rstrip())
    stream.close()
def start_server(command):
    """Launch ``command`` as a child process and echo its output.

    The child's stderr is merged into stdout and the combined stream is
    decoded as text. A daemon thread running ``read_output`` prints that
    stream to the console.

    Returns:
        The ``subprocess.Popen`` handle of the started process.
    """
    child = subprocess.Popen(
        command,
        text=True,  # decode the pipe as text instead of bytes
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # fold stderr into the same pipe
    )
    # daemon=True so the reader thread never keeps the interpreter alive.
    reader = threading.Thread(target=read_output, args=(child,), daemon=True)
    reader.start()
    return child
# Start the llama-server binary with the converted model; the "vicuna"
# chat template is selected on the command line.
server_process = start_server(
    [
        "./ik_llama.cpp/build/bin/llama-server",
        "-m",
        "./ik_llama.cpp/build/model-out.gguf",
        "--chat-template",
        "vicuna",
    ]
)

# OpenAI client pointed at BASE_URL; the key looks like a placeholder
# (presumably the local server does not check it — confirm).
cli = OpenAI(base_url=BASE_URL, api_key="sk-nokey")
def openai_call(message, history, system_prompt, max_new_tokens):
    """Stream a chat completion for ``message`` and yield the growing reply.

    Args:
        message: The new user message text.
        history: Prior conversation as a list of ``{"role", "content"}``
            dicts. It is read but never modified.
        system_prompt: Text sent as the leading ``system`` message.
        max_new_tokens: Forwarded as ``max_tokens`` to the completion call.

    Yields:
        ``(reply_so_far, None)`` after every content chunk, then one final
        ``(reply, gr.State(messages))`` where ``messages`` is the full
        exchange: system prompt, history, user message and assistant reply.
    """
    # Build a fresh list instead of inserting into `history`: that list is
    # owned by the caller, and mutating it in place would leak the system
    # prompt and the new turn back into the caller's copy.
    messages = [
        {"role": "system", "content": system_prompt},
        *history,
        {"role": "user", "content": message},
    ]
    response = cli.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=max_new_tokens,
        stop=["<|im_end|>", "</s>"],
        stream=True,
    )
    reply = ""
    for chunk in response:
        # A streamed chunk may carry an empty `choices` list; indexing
        # it would raise IndexError, so skip it.
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta.content
        if delta is not None:
            reply += delta
            yield reply, None
    messages.append({"role": "assistant", "content": reply})
    yield reply, gr.State(messages)
def gen_file(conv_state):
    """Dump the conversation to a uniquely named JSON file for download.

    Args:
        conv_state: Object whose ``.value`` holds the message list (the
            ``gr.State`` yielded last by ``openai_call``), or ``None`` when
            no reply has completed yet.

    Returns:
        ``(gr.File(fname), gr.State(fname))`` where ``fname`` is the path of
        the JSON file written to the current working directory.

    Raises:
        gr.Error: If there is no conversation to export yet.
    """
    # The state starts out empty and is reset to None while a reply is
    # streaming; surface a readable message instead of an AttributeError.
    if conv_state is None:
        raise gr.Error("No conversation to export yet - send a message first.")
    fname = f"{uuid.uuid4()}.json"
    with open(fname, mode="w", encoding="utf-8") as f:
        json.dump(conv_state.value, f, indent=4, ensure_ascii=False)
    return gr.File(fname), gr.State(fname)
def rm_file_wrap(path: str):
    """Delete the file at ``path``, reporting any ``OSError`` without raising."""
    try:
        os.remove(path)
    except OSError as err:
        # Removal is best-effort: report the failure and carry on.
        print(f"Error: {err.filename} - {err.strerror}.")
def on_download(download_data: gr.DownloadData):
    """Download-event handler: delete the file that was just served."""
    served_path = download_data.file.path
    print(f"deleting {served_path}")
    rm_file_wrap(served_path)
def clean_file(orig_path):
    """Delete the file whose path is stored in ``orig_path.value``."""
    stale_path = orig_path.value
    print(f"Deleting {stale_path}")
    rm_file_wrap(stale_path)
# Assemble the UI: a chat interface plus a control that exports the
# conversation as a downloadable JSON file.
with gr.Blocks() as demo:
    #download=gr.DownloadButton(label="Download Conversation", value=None)
    # Receives the second value yielded by openai_call (wired through
    # additional_outputs below).
    conv_state = gr.State()
    # Receives the second value returned by gen_file (see the click wiring).
    orig_path = gr.State()
    chat = gr.ChatInterface(
        openai_call,
        type="messages",
        # Extra inputs, in order; presumably these feed openai_call's
        # system_prompt and max_new_tokens parameters.
        additional_inputs=[
            gr.Textbox("You are a helpful AI assistant.", label="System Prompt"),
            gr.Slider(30, 2048, label="Max new tokens"),
        ],
        additional_outputs=[conv_state],
        title="Chat with bitnet using ik_llama",
        description="Warning: Do not input sensitive info - assume everything is public! Also note this is experimental and ik_llama server doesn't seems to support arbitrary chat template, we're using vicuna as approximate match - so there might be intelligence degradation."
    )
    download_file = gr.File()
    # On click, gen_file writes the JSON file and fills download_file and
    # orig_path; only if that succeeds does clean_file delete the file named
    # in orig_path.
    # NOTE(review): presumably safe because Gradio serves its own copy of the
    # file — confirm.
    # NOTE: download_btn ends up bound to the result of the chained
    # .click(...).success(...) call, not to the gr.Button itself.
    download_btn = gr.Button("Export Conversation for Download") \
        .click(fn=gen_file, inputs=[conv_state], outputs=[download_file, orig_path]) \
        .success(fn=clean_file, inputs=[orig_path])
    # Run on_download for the file component's download event; it takes no
    # component inputs and produces no outputs.
    download_file.download(on_download, None, None)
# Serve the UI, and always shut the llama-server child down on exit.
try:
    demo.queue(max_size=10, api_open=True).launch(server_name='0.0.0.0')
finally:
    # Stop the server
    server_process.terminate()
    try:
        # Bound the wait: a child that ignores SIGTERM would otherwise hang
        # shutdown forever.
        server_process.wait(timeout=10)
    except subprocess.TimeoutExpired:
        # Escalate to a hard kill, then reap the process.
        server_process.kill()
        server_process.wait()
    print("Server stopped.")