# syntax=docker/dockerfile:1
# Use a more recent base image if possible (check vLLM compatibility)
FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
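# A newer tag such as the one below may work, but verify it against the vLLM
# version you install (this tag is an example, not tested here):
# FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04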

# Set essential environment variables; disable pip and numba caching.
# Note: modern pip parses PIP_NO_CACHE_DIR as a boolean, so "off" would leave
# the cache ENABLED; use 1 to actually disable it. TRANSFORMERS_CACHE is
# deprecated in newer transformers releases but kept here for older versions.
ENV DEBIAN_FRONTEND=noninteractive \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=on \
    PYTHONIOENCODING=utf-8 \
    PYTHONUNBUFFERED=1 \
    HF_HOME=/app/huggingface_cache \
    TRANSFORMERS_CACHE=/app/huggingface_cache \
    NUMBA_DISABLE_CACHE=1 \
    NUMBA_CACHE_DIR=/tmp/numba_cache

# Create the necessary cache directories and ensure they are writable
# Use the ENV variables for consistency
RUN mkdir -p ${HF_HOME} && chmod -R 777 ${HF_HOME} && \
    mkdir -p ${NUMBA_CACHE_DIR} && chmod -R 777 ${NUMBA_CACHE_DIR}

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    git git-lfs python3 python3-pip curl ca-certificates build-essential && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Install Rust (required to build some tokenizers and other packages from source).
# The PATH update below exposes the rustup/cargo binaries to later layers.
ENV PATH="/root/.cargo/bin:${PATH}"
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
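# Optional sanity check that the toolchain is on PATH (uncomment if desired):
# RUN cargo --version && rustc --version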

WORKDIR /app

# Copy the requirements file and install Python dependencies.
# If vLLM fails against this CUDA base image, consider upgrading it here:
#   pip3 install --no-cache-dir --upgrade vllm
RUN pip3 install --no-cache-dir --upgrade pip && \
    pip3 install --no-cache-dir -r requirements.txt
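# Illustrative minimum contents of requirements.txt for this image; the real
# file is not shown here, and any version pins are your own choice:
#   vllm
#   huggingface_hub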

# Use a BuildKit secret mount to inject HF_TOKEN at build time without
# persisting it in any image layer, then verify the model download succeeded.
# (local_dir_use_symlinks is deprecated and ignored by newer huggingface_hub.)
RUN --mount=type=secret,id=HF_TOKEN,env=HF_TOKEN,mode=0444,required=true \
    python3 -c "import os; \
from huggingface_hub import snapshot_download; \
token = os.environ.get('HF_TOKEN'); \
assert token, 'HF_TOKEN is not set!'; \
print('Token is set. Downloading model...'); \
snapshot_download(repo_id='mistralai/Mistral-7B-Instruct-v0.1', local_dir='/app/model', token=token, local_dir_use_symlinks=False)" \
    && ls -l /app/model # list the directory to confirm the download produced files
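# Example build invocation (BuildKit required; the image tag is illustrative,
# and HF_TOKEN is read from the invoking shell's environment):
#   DOCKER_BUILDKIT=1 docker build --secret id=HF_TOKEN,env=HF_TOKEN -t vllm-mistral .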

# Expose the port for the API server
EXPOSE 8000

# Healthcheck to verify API server is responding
HEALTHCHECK --interval=20s --timeout=10s --start-period=120s --retries=3 \
    CMD curl --fail http://localhost:8000/health || exit 1
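# Example run invocation (assumes the NVIDIA Container Toolkit is installed
# on the host; the tag matches the build example above):
#   docker run --gpus all -p 8000:8000 vllm-mistral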

# Launch the vLLM OpenAI-style API server, pointing to the downloaded model directory.
CMD ["python3", "-m", "vllm.entrypoints.openai.api_server", "--model", "/app/model"]