Tonic committed
Commit 9fadf46 · unverified · 1 Parent(s): 5760858

using docker initial commit

Files changed (5)
  1. .gitignore +0 -2
  2. Dockerfile +45 -0
  3. app.py +64 -48
  4. apt.txt +5 -0
  5. requirements.txt +3 -0
.gitignore DELETED
@@ -1,2 +0,0 @@
- ex1.py
- ex2.py
Dockerfile ADDED
@@ -0,0 +1,45 @@
+ FROM ubuntu:22.04
+
+ # Set non-interactive frontend for apt
+ ENV DEBIAN_FRONTEND=noninteractive
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     cmake \
+     clang-18 \
+     git \
+     python3.9 \
+     python3-pip \
+     wget \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Set Clang as default compiler
+ ENV CC=/usr/bin/clang-18
+ ENV CXX=/usr/bin/clang++-18
+
+ # Create a non-root user (Hugging Face Spaces run as non-root)
+ RUN useradd -m -u 1000 user
+ USER user
+ WORKDIR /home/user/app
+
+ # Install Python dependencies
+ COPY requirements.txt .
+ RUN pip3 install --no-cache-dir -r requirements.txt
+
+ # Clone and build bitnet.cpp
+ RUN git clone https://github.com/microsoft/BitNet.git bitnet.cpp
+ WORKDIR /home/user/app/bitnet.cpp
+ RUN mkdir build && cd build && \
+     cmake .. -DCMAKE_C_COMPILER=clang-18 -DCMAKE_CXX_COMPILER=clang++-18 && \
+     make -j$(nproc)
+
+ # Copy application code
+ WORKDIR /home/user/app
+ COPY app.py .
+
+ # Expose port 7860 (required for Hugging Face Spaces)
+ EXPOSE 7860
+
+ # Run Gradio app
+ CMD ["python3", "app.py"]
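The Dockerfile builds bitnet.cpp with CMake, and app.py (below) expects the resulting binary at bitnet.cpp/build/bin/main. Whether make actually places the executable at that path depends on BitNet's CMake layout, so a quick startup check is a cheap way to fail fast. The following sketch is hypothetical and not part of this commit:

# check_binary.py: hypothetical sanity check, not part of this commit.
# Confirms the bitnet.cpp build produced an executable where app.py expects it;
# the build/bin/main location is an assumption about BitNet's CMake output layout.
import os
import sys

BITNET_BINARY = "/home/user/app/bitnet.cpp/build/bin/main"

if os.path.isfile(BITNET_BINARY) and os.access(BITNET_BINARY, os.X_OK):
    print(f"Found bitnet.cpp binary: {BITNET_BINARY}")
else:
    sys.exit(f"bitnet.cpp binary missing or not executable: {BITNET_BINARY}")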
app.py CHANGED
@@ -1,69 +1,87 @@
  import gradio as gr
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+ import subprocess
+ import os
+ import json
+ from huggingface_hub import hf_hub_download
+ import logging

- # Singleton for model and tokenizer
- _model = None
- _tokenizer = None
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)

- def load_model():
-     global _model, _tokenizer
-     if _model is None or _tokenizer is None:
-         model_id = "microsoft/bitnet-b1.58-2B-4T"
-         _tokenizer = AutoTokenizer.from_pretrained(
-             model_id,
-             trust_remote_code=True
-         )
-         config = AutoConfig.from_pretrained(
-             model_id,
-             trust_remote_code=True
-         )
-         _model = AutoModelForCausalLM.from_pretrained(
-             model_id,
-             config=config,
-             torch_dtype=torch.bfloat16,
-             trust_remote_code=True
+ # Paths
+ BITNET_BINARY = "/home/user/app/bitnet.cpp/build/bin/main"
+ MODEL_DIR = "/home/user/app/models"
+ MODEL_PATH = os.path.join(MODEL_DIR, "bitnet-b1.58-2B-4T.gguf")
+ MODEL_REPO = "microsoft/bitnet-b1.58-2B-4T-gguf"
+ MODEL_FILE = "bitnet-b1.58-2B-4T.gguf"
+
+ # Download model weights if not present
+ def download_model():
+     if not os.path.exists(MODEL_PATH):
+         logger.info("Downloading model weights...")
+         os.makedirs(MODEL_DIR, exist_ok=True)
+         hf_hub_download(
+             repo_id=MODEL_REPO,
+             filename=MODEL_FILE,
+             local_dir=MODEL_DIR,
+             local_dir_use_symlinks=False
          )
-     return _model, _tokenizer
+         logger.info("Model weights downloaded successfully.")
+     else:
+         logger.info("Model weights already exist.")
+
+ # Run model download on startup
+ download_model()
+
+ def run_bitnet_inference(prompt, max_tokens=50, temperature=0.7, top_p=0.9, top_k=50):
+     # Prepare the command to call the bitnet.cpp binary
+     cmd = [
+         BITNET_BINARY,
+         "-m", MODEL_PATH,
+         "-p", prompt,
+         "--max-tokens", str(max_tokens),
+         "--temperature", str(temperature),
+         "--top-p", str(top_p),
+         "--top-k", str(top_k)
+     ]
+
+     try:
+         # Run the command and capture output
+         result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+         output = result.stdout.strip()
+         return output
+     except subprocess.CalledProcessError as e:
+         logger.error(f"Inference error: {e.stderr}")
+         return f"Error during inference: {e.stderr}"

  def manage_history(history):
-     # Limit to 3 turns (each turn is user + assistant = 2 messages)
-     max_messages = 6  # 3 turns * 2 messages per turn
+     # Limit to 3 turns (user + assistant = 2 messages per turn)
+     max_messages = 6
      if len(history) > max_messages:
          history = history[-max_messages:]

      # Limit total character count to 300
      total_chars = sum(len(msg["content"]) for msg in history)
      while total_chars > 300 and history:
-         history.pop(0)  # Remove oldest message
-         total_chars = sum(len(msg["content"]) for msg in history)
+         history.pop(0)
+         total_chars = sum(len(msg["content"]) for msg in history)

      return history

  def generate_response(user_input, system_prompt, max_new_tokens, temperature, top_p, top_k, history):
-     model, tokenizer = load_model()
-
-     messages = [
-         {"role": "system", "content": system_prompt},
-         {"role": "user", "content": user_input},
-     ]
+     # Format the prompt for bitnet.cpp
+     full_prompt = f"{system_prompt}\n\nUser: {user_input}\nAssistant: "

-     prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-     chat_input = tokenizer(prompt, return_tensors="pt").to(model.device)
-
-     # Generate response
-     chat_outputs = model.generate(
-         **chat_input,
-         max_new_tokens=max_new_tokens,
+     # Run inference
+     response = run_bitnet_inference(
+         full_prompt,
+         max_tokens=max_new_tokens,
          temperature=temperature,
          top_p=top_p,
-         top_k=top_k,
-         do_sample=True
+         top_k=top_k
      )

-     # Decode response
-     response = tokenizer.decode(chat_outputs[0][chat_input['input_ids'].shape[-1]:], skip_special_tokens=True)
-
      # Update history
      history.append({"role": "user", "content": user_input})
      history.append({"role": "assistant", "content": response})
@@ -155,6 +173,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
      )

  if __name__ == "__main__":
-     # Preload model to avoid threading issues
-     load_model()
-     demo.launch(ssr_mode=False, share=True)
+     demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False, share=True)
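Since generate_response now shells out to the bitnet.cpp binary instead of calling transformers, the inference path can be smoke-tested outside Gradio. A minimal standalone sketch, not part of this commit; paths and flag names are copied from run_bitnet_inference above, and whether the binary accepts exactly these flags is an assumption:

# smoke_test.py: hypothetical standalone check, not part of this commit.
# Rebuilds the command that app.py's run_bitnet_inference() constructs; flag
# names are taken from app.py and assumed to match the bitnet.cpp CLI.
import subprocess

BITNET_BINARY = "/home/user/app/bitnet.cpp/build/bin/main"
MODEL_PATH = "/home/user/app/models/bitnet-b1.58-2B-4T.gguf"

cmd = [
    BITNET_BINARY,
    "-m", MODEL_PATH,
    "-p", "You are a helpful assistant.\n\nUser: Say hello.\nAssistant: ",
    "--max-tokens", "32",
    "--temperature", "0.7",
    "--top-p", "0.9",
    "--top-k", "50",
]

# capture_output=True keeps stdout/stderr for inspection instead of streaming them
result = subprocess.run(cmd, capture_output=True, text=True)
print("return code:", result.returncode)
print(result.stdout.strip() or result.stderr.strip())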
 
 
apt.txt ADDED
@@ -0,0 +1,5 @@
+ build-essential
+ cmake
+ clang-18
+ git
+ wget
requirements.txt CHANGED
@@ -1,3 +1,6 @@
  torch
  git+https://github.com/shumingma/transformers.git
  accelerate
+ gradio
+ numpy
+ huggingface_hub