import gradio as gr
from openai import OpenAI
import uuid
import json
import os
import tempfile
import subprocess
import threading
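
# The script below starts a local ik_llama.cpp llama-server and talks to it
# through its OpenAI-compatible /v1 endpoint. MODEL_NAME is simply the model
# identifier sent with each request.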
BASE_URL = "http://localhost:8080/v1"
MODEL_NAME = "bn"

def read_output(process):
    """Reads the output from the subprocess and prints it to the console."""
    for line in iter(process.stdout.readline, ""):
        print(line.rstrip())
    process.stdout.close()

def start_server(command):
    """Starts the server as a subprocess and captures its stdout."""
    # Start the server process
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # Redirect stderr to stdout
        text=True  # Automatically decode the output to text
    )
    # Start a thread to read the output
    output_thread = threading.Thread(target=read_output, args=(process,))
    output_thread.daemon = True  # Daemonize the thread so it exits when the main program does
    output_thread.start()
    return process
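
# Launch llama-server on the bitnet GGUF. As noted in the UI description below,
# the server doesn't appear to support arbitrary chat templates, so "vicuna" is
# used as an approximation of the model's native template.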
server_process = start_server(["./ik_llama.cpp/build/bin/llama-server", "-m", "./ik_llama.cpp/build/model-out.gguf", "--chat-template", "vicuna"])
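
# OpenAI client pointed at the local server; the API key is a placeholder,
# since the local endpoint is not expected to validate it.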
cli = OpenAI(api_key="sk-nokey", base_url=BASE_URL)

def openai_call(message, history, system_prompt, max_new_tokens):
    """Stream a chat completion from the local server.

    Yields partial replies for the ChatInterface to stream, then a final yield
    that also carries the full conversation as state (used for export).
    """
    #print(history) # DEBUG
    history.insert(0, {
        "role": "system",
        "content": system_prompt
    })
    history.append({
        "role": "user",
        "content": message
    })
    response = cli.chat.completions.create(
        model=MODEL_NAME,
        messages=history,
        max_tokens=max_new_tokens,
        stop=["<|im_end|>", "</s>"],
        stream=True
    )
    reply = ""
    for chunk in response:
        delta = chunk.choices[0].delta.content
        if delta is not None:
            reply = reply + delta
            yield reply, None
    history.append({"role": "assistant", "content": reply})
    yield reply, gr.State(history)
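
# Note: the final yield wraps history in gr.State, so downstream handlers
# (gen_file, clean_file) receive a State wrapper and read the payload via .value.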

def gen_file(conv_state):
    """Dump the conversation held in conv_state to a JSON file for download."""
    #print(conv_state) # DEBUG
    fname = f"{uuid.uuid4()}.json"
    #with tempfile.NamedTemporaryFile(prefix=str(uuid.uuid4()), suffix=".json", mode="w", encoding="utf-8", delete_on_close=False) as f:
    with open(fname, mode="w", encoding="utf-8") as f:
        json.dump(conv_state.value, f, indent=4, ensure_ascii=False)
    return gr.File(fname), gr.State(fname)

def rm_file_wrap(path: str):
    # Try to delete the file.
    try:
        os.remove(path)
    except OSError as e:
        # If it fails, inform the user.
        print(f"Error: {e.filename} - {e.strerror}.")

def on_download(download_data: gr.DownloadData):
    """Delete the served copy of the export once the user has downloaded it."""
    print(f"deleting {download_data.file.path}")
    rm_file_wrap(download_data.file.path)

def clean_file(orig_path):
    """Delete the locally written export file (orig_path is the gr.State holding its path)."""
    print(f"Deleting {orig_path.value}")
    rm_file_wrap(orig_path.value)

with gr.Blocks() as demo:
    #download=gr.DownloadButton(label="Download Conversation", value=None)
    conv_state = gr.State()
    orig_path = gr.State()
    chat = gr.ChatInterface(
        openai_call,
        type="messages",
        additional_inputs=[
            gr.Textbox("You are a helpful AI assistant.", label="System Prompt"),
            gr.Slider(30, 2048, label="Max new tokens"),
        ],
        additional_outputs=[conv_state],
title="Chat with bitnet using ik_llama", | |
description="Warning: Do not input sensitive info - assume everything is public! Also note this is experimental and ik_llama server doesn't seems to support arbitrary chat template, we're using vicuna as approximate match - so there might be intelligence degradation." | |
) | |
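
    # Export flow (as wired below): gen_file writes the conversation to a local
    # JSON file and hands it to the File component, clean_file then removes that
    # local copy once Gradio has picked it up, and on_download removes the served
    # copy after the user downloads it.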
    download_file = gr.File()
    download_btn = gr.Button("Export Conversation for Download") \
        .click(fn=gen_file, inputs=[conv_state], outputs=[download_file, orig_path]) \
        .success(fn=clean_file, inputs=[orig_path])
    download_file.download(on_download, None, None)

try:
    demo.queue(max_size=10, api_open=True).launch(server_name='0.0.0.0')
finally:
    # Stop the server
    server_process.terminate()
    server_process.wait()
    print("Server stopped.")