deploy-vllm2 / entrypoint.sh
#!/bin/bash
# Default values
# MODEL=${MODEL:-"microsoft/Phi-3-mini-4k-instruct"}
MODEL=${MODEL:-"EleutherAI/pythia-70m"}
DTYPE=${DTYPE:-"half"}
MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-512}
MAX_NUM_SEQS=${MAX_NUM_SEQS:-16}
GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.85}
MAX_MODEL_LEN=${MAX_MODEL_LEN:-512}
ENFORCE_EAGER=${ENFORCE_EAGER:-true}
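# The defaults above can be overridden at launch time, e.g. (hypothetical invocation):
#   docker run -e MODEL="microsoft/Phi-3-mini-4k-instruct" -e MAX_MODEL_LEN=2048 <image>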
# Disable usage stats collection (vLLM honors VLLM_NO_USAGE_STATS / DO_NOT_TRACK)
export VLLM_NO_USAGE_STATS=1
# Print environment for debugging
echo "Environment variables:"
env
# Create and set permissions for the config directory
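# /tmp/config serves as a fallback when XDG_CONFIG_HOME is unset, giving tools
# that write under the XDG config directory (e.g. vLLM's usage-stats files) a
# writable location even if the container user has no usable $HOME.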
CONFIG_DIR=${XDG_CONFIG_HOME:-"/tmp/config"}
if [ ! -d "$CONFIG_DIR" ]; then
    mkdir -p "$CONFIG_DIR"
fi
chmod -R 777 "$CONFIG_DIR"
echo "Permissions for $CONFIG_DIR:"
ls -la "$CONFIG_DIR"
# Check and set permissions for directories
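# These are the writable cache locations the container is expected to use
# (Hugging Face hub cache, generic cache, numba JIT cache, outlines FSM cache,
# and a root-level config fallback), presumably matching the HF_HOME /
# NUMBA_CACHE_DIR / OUTLINES_CACHE_DIR paths set in the accompanying Dockerfile.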
for dir in /tmp/huggingface /tmp/cache /tmp/numba_cache /tmp/outlines_cache /.config; do
    if [ ! -d "$dir" ]; then
        mkdir -p "$dir"
    fi
    chmod -R 777 "$dir"
    echo "Permissions for $dir:"
    ls -la "$dir"
done
# # Construct the command
# CMD="vllm serve $MODEL \
# --host 0.0.0.0 \
# --port 8000 \
# --dtype $DTYPE \
# --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
# --max-num-seqs $MAX_NUM_SEQS \
# --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
# --max-model-len $MAX_MODEL_LEN"
# # Add enforce-eager only if it's set to true
# if [ "$ENFORCE_EAGER" = "true" ]; then
# CMD="$CMD --enforce-eager"
# fi
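# Note: the live command below hardcodes a tiny model and conservative limits;
# the GPU_MEMORY_UTILIZATION and MAX_MODEL_LEN defaults defined above are not
# used by it.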
CMD="python3 -m vllm.entrypoints.openai.api_server \
--model EleutherAI/pythia-70m \
--gpu-memory-utilization 0.9
--max-model-len 200"
# Execute the command
echo "Running command: $CMD"
exec $CMD
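# Example request once the server is up (run from another shell; the
# OpenAI-compatible server listens on port 8000 by default):
#   curl http://localhost:8000/v1/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "EleutherAI/pythia-70m", "prompt": "Hello", "max_tokens": 16}'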