mistral-vllm-chat / Dockerfile
# syntax=docker/dockerfile:1
# Use a more recent base image if possible (check vLLM compatibility)
FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
# Set essential environment variables; disable pip caching and point numba's cache at a writable location
ENV DEBIAN_FRONTEND=noninteractive \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=on \
    PYTHONIOENCODING=utf-8 \
    PYTHONUNBUFFERED=1 \
    HF_HOME=/app/huggingface_cache \
    TRANSFORMERS_CACHE=/app/huggingface_cache \
    NUMBA_DISABLE_CACHE=1 \
    NUMBA_CACHE_DIR=/tmp/numba_cache
# Create the necessary cache directories and ensure they are writable
# Use the ENV variables for consistency
RUN mkdir -p ${HF_HOME} && chmod -R 777 ${HF_HOME} && \
    mkdir -p ${NUMBA_CACHE_DIR} && chmod -R 777 ${NUMBA_CACHE_DIR}
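# (The chmod 777 above keeps the cache directories usable if the container later runs as a non-root user,
#  which is common on hosted container platforms.)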
# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    git git-lfs python3 python3-pip curl ca-certificates build-essential && \
    apt-get clean && rm -rf /var/lib/apt/lists/*
# Install Rust (some tokenizer/Python packages need it to build from source) and put cargo binaries on PATH
ENV PATH="/root/.cargo/bin:${PATH}"
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
WORKDIR /app
# Copy the requirements file and install Python dependencies
COPY requirements.txt .
# If vLLM errors persist at runtime, consider upgrading it explicitly: pip3 install --no-cache-dir --upgrade vllm
RUN pip3 install --no-cache-dir --upgrade pip && \
    pip3 install --no-cache-dir -r requirements.txt
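# Note: requirements.txt is not shown in this file. The download step below assumes it installs at least
# vllm and huggingface_hub (snapshot_download is imported from huggingface_hub); a minimal example:
#   vllm
#   huggingface_hub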
# Use a BuildKit secret mount to inject HF_TOKEN at build time without baking it into an image layer,
# then download the model and verify that the snapshot landed in /app/model.
RUN --mount=type=secret,id=HF_TOKEN,env=HF_TOKEN,required=true \
    python3 -c "import os; \
from huggingface_hub import snapshot_download; \
token = os.environ.get('HF_TOKEN'); \
assert token, 'HF_TOKEN is not set!'; \
print('Token is set. Downloading model...'); \
snapshot_download(repo_id='mistralai/Mistral-7B-Instruct-v0.1', local_dir='/app/model', token=token, local_dir_use_symlinks=False)" && \
    ls -l /app/model
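# Example build invocation (reference only; the token file path and image tag are placeholders):
#   DOCKER_BUILDKIT=1 docker build --secret id=HF_TOKEN,src=./hf_token.txt -t mistral-vllm-chat .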
# Expose the port for the API server
EXPOSE 8000
# Healthcheck to verify API server is responding
HEALTHCHECK --interval=20s --timeout=10s --start-period=120s --retries=3 \
    CMD curl --fail http://localhost:8000/health || exit 1
# Launch the vLLM OpenAI-style API server, pointing to the downloaded model directory.
CMD ["python3", "-m", "vllm.entrypoints.openai.api_server", "--model", "/app/model"]