Tonic committed
Commit 9fadf46 · unverified · 1 Parent(s): 5760858

using docker initial commit

Files changed (5)
  1. .gitignore +0 -2
  2. Dockerfile +45 -0
  3. app.py +64 -48
  4. apt.txt +5 -0
  5. requirements.txt +3 -0
.gitignore DELETED
@@ -1,2 +0,0 @@
- ex1.py
- ex2.py
Dockerfile ADDED
@@ -0,0 +1,45 @@
+ FROM ubuntu:22.04
+
+ # Set non-interactive frontend for apt
+ ENV DEBIAN_FRONTEND=noninteractive
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     cmake \
+     clang-18 \
+     git \
+     python3.9 \
+     python3-pip \
+     wget \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Set Clang as default compiler
+ ENV CC=/usr/bin/clang-18
+ ENV CXX=/usr/bin/clang++-18
+
+ # Create a non-root user (Hugging Face Spaces run as non-root)
+ RUN useradd -m -u 1000 user
+ USER user
+ WORKDIR /home/user/app
+
+ # Install Python dependencies
+ COPY requirements.txt .
+ RUN pip3 install --no-cache-dir -r requirements.txt
+
+ # Clone and build bitnet.cpp
+ RUN git clone https://github.com/microsoft/BitNet.git bitnet.cpp
+ WORKDIR /home/user/app/bitnet.cpp
+ RUN mkdir build && cd build && \
+     cmake .. -DCMAKE_C_COMPILER=clang-18 -DCMAKE_CXX_COMPILER=clang++-18 && \
+     make -j$(nproc)
+
+ # Copy application code
+ WORKDIR /home/user/app
+ COPY app.py .
+
+ # Expose port 7860 (required for Hugging Face Spaces)
+ EXPOSE 7860
+
+ # Run Gradio app
+ CMD ["python3", "app.py"]
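The Dockerfile builds bitnet.cpp with CMake, and app.py (below) expects the resulting binary at bitnet.cpp/build/bin/main. Whether make actually places the executable at that path depends on BitNet's CMake layout, so a quick startup check is a cheap way to fail fast. The following sketch is hypothetical and not part of this commit:

# check_binary.py: hypothetical sanity check, not part of this commit.
# Confirms the bitnet.cpp build produced an executable where app.py expects it;
# the build/bin/main location is an assumption about BitNet's CMake output layout.
import os
import sys

BITNET_BINARY = "/home/user/app/bitnet.cpp/build/bin/main"

if os.path.isfile(BITNET_BINARY) and os.access(BITNET_BINARY, os.X_OK):
    print(f"Found bitnet.cpp binary: {BITNET_BINARY}")
else:
    sys.exit(f"bitnet.cpp binary missing or not executable: {BITNET_BINARY}")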
app.py CHANGED
@@ -1,69 +1,87 @@
  import gradio as gr
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+ import subprocess
+ import os
+ import json
+ from huggingface_hub import hf_hub_download
+ import logging

- # Singleton for model and tokenizer
- _model = None
- _tokenizer = None
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)

- def load_model():
-     global _model, _tokenizer
-     if _model is None or _tokenizer is None:
-         model_id = "microsoft/bitnet-b1.58-2B-4T"
-         _tokenizer = AutoTokenizer.from_pretrained(
-             model_id,
-             trust_remote_code=True
-         )
-         config = AutoConfig.from_pretrained(
-             model_id,
-             trust_remote_code=True
-         )
-         _model = AutoModelForCausalLM.from_pretrained(
-             model_id,
-             config=config,
-             torch_dtype=torch.bfloat16,
-             trust_remote_code=True
+ # Paths
+ BITNET_BINARY = "/home/user/app/bitnet.cpp/build/bin/main"
+ MODEL_DIR = "/home/user/app/models"
+ MODEL_PATH = os.path.join(MODEL_DIR, "bitnet-b1.58-2B-4T.gguf")
+ MODEL_REPO = "microsoft/bitnet-b1.58-2B-4T-gguf"
+ MODEL_FILE = "bitnet-b1.58-2B-4T.gguf"
+
+ # Download model weights if not present
+ def download_model():
+     if not os.path.exists(MODEL_PATH):
+         logger.info("Downloading model weights...")
+         os.makedirs(MODEL_DIR, exist_ok=True)
+         hf_hub_download(
+             repo_id=MODEL_REPO,
+             filename=MODEL_FILE,
+             local_dir=MODEL_DIR,
+             local_dir_use_symlinks=False
          )
-     return _model, _tokenizer
+         logger.info("Model weights downloaded successfully.")
+     else:
+         logger.info("Model weights already exist.")
+
+ # Run model download on startup
+ download_model()
+
+ def run_bitnet_inference(prompt, max_tokens=50, temperature=0.7, top_p=0.9, top_k=50):
+     # Prepare the command to call the bitnet.cpp binary
+     cmd = [
+         BITNET_BINARY,
+         "-m", MODEL_PATH,
+         "-p", prompt,
+         "--max-tokens", str(max_tokens),
+         "--temperature", str(temperature),
+         "--top-p", str(top_p),
+         "--top-k", str(top_k)
+     ]
+
+     try:
+         # Run the command and capture output
+         result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+         output = result.stdout.strip()
+         return output
+     except subprocess.CalledProcessError as e:
+         logger.error(f"Inference error: {e.stderr}")
+         return f"Error during inference: {e.stderr}"

  def manage_history(history):
-     # Limit to 3 turns (each turn is user + assistant = 2 messages)
-     max_messages = 6  # 3 turns * 2 messages per turn
+     # Limit to 3 turns (user + assistant = 2 messages per turn)
+     max_messages = 6
      if len(history) > max_messages:
          history = history[-max_messages:]

      # Limit total character count to 300
      total_chars = sum(len(msg["content"]) for msg in history)
      while total_chars > 300 and history:
-         history.pop(0)  # Remove oldest message
-         total_chars = sum(len(msg["content"]) for msg in history)
+         history.pop(0)
+         total_chars = sum(len(msg["content"]) for msg in history)

      return history

  def generate_response(user_input, system_prompt, max_new_tokens, temperature, top_p, top_k, history):
-     model, tokenizer = load_model()
-
-     messages = [
-         {"role": "system", "content": system_prompt},
-         {"role": "user", "content": user_input},
-     ]
+     # Format the prompt for bitnet.cpp
+     full_prompt = f"{system_prompt}\n\nUser: {user_input}\nAssistant: "

-     prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-     chat_input = tokenizer(prompt, return_tensors="pt").to(model.device)
-
-     # Generate response
-     chat_outputs = model.generate(
-         **chat_input,
-         max_new_tokens=max_new_tokens,
+     # Run inference
+     response = run_bitnet_inference(
+         full_prompt,
+         max_tokens=max_new_tokens,
          temperature=temperature,
          top_p=top_p,
-         top_k=top_k,
-         do_sample=True
+         top_k=top_k
      )

-     # Decode response
-     response = tokenizer.decode(chat_outputs[0][chat_input['input_ids'].shape[-1]:], skip_special_tokens=True)
-
      # Update history
      history.append({"role": "user", "content": user_input})
      history.append({"role": "assistant", "content": response})
@@ -155,6 +173,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
      )

  if __name__ == "__main__":
-     # Preload model to avoid threading issues
-     load_model()
-     demo.launch(ssr_mode=False, share=True)
+     demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False, share=True)
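Since generate_response now shells out to the bitnet.cpp binary instead of calling transformers, the inference path can be smoke-tested outside Gradio. A minimal standalone sketch, not part of this commit; paths and flag names are copied from run_bitnet_inference above, and whether the binary accepts exactly these flags is an assumption:

# smoke_test.py: hypothetical standalone check, not part of this commit.
# Rebuilds the command that app.py's run_bitnet_inference() constructs; flag
# names are taken from app.py and assumed to match the bitnet.cpp CLI.
import subprocess

BITNET_BINARY = "/home/user/app/bitnet.cpp/build/bin/main"
MODEL_PATH = "/home/user/app/models/bitnet-b1.58-2B-4T.gguf"

cmd = [
    BITNET_BINARY,
    "-m", MODEL_PATH,
    "-p", "You are a helpful assistant.\n\nUser: Say hello.\nAssistant: ",
    "--max-tokens", "32",
    "--temperature", "0.7",
    "--top-p", "0.9",
    "--top-k", "50",
]

# capture_output=True keeps stdout/stderr for inspection instead of streaming them
result = subprocess.run(cmd, capture_output=True, text=True)
print("return code:", result.returncode)
print(result.stdout.strip() or result.stderr.strip())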
 
 
apt.txt ADDED
@@ -0,0 +1,5 @@
+ build-essential
+ cmake
+ clang-18
+ git
+ wget
requirements.txt CHANGED
@@ -1,3 +1,6 @@
  torch
  git+https://github.com/shumingma/transformers.git
  accelerate
+ gradio
+ numpy
+ huggingface_hub