GenieHive/scripts/test_qwen35_server_cuda_con...

#!/usr/bin/env bash
set -euo pipefail
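
# Smoke test: verify that the llama.cpp server-cuda image can load a local
# GGUF model on one GPU and start serving. All settings below can be
# overridden via the environment (e.g. GPU_INDEX=1 PORT=19092).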
IMAGE="${IMAGE:-ghcr.io/ggml-org/llama.cpp:server-cuda}"
MODEL_PATH="${MODEL_PATH:-/home/netuser/bin/models/llm/Qwen3.5-9B-Q5_K_M.gguf}"
GPU_INDEX="${GPU_INDEX:-0}"
CTX_SIZE="${CTX_SIZE:-512}"
PORT="${PORT:-19091}"
TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-90}"

if [[ ! -f "${MODEL_PATH}" ]]; then
  echo "Model not found: ${MODEL_PATH}" >&2
  exit 1
fi

echo "Image: ${IMAGE}"
echo "Model: ${MODEL_PATH}"
echo "GPU: ${GPU_INDEX}"
echo "Port: ${PORT}"
echo "Timeout: ${TIMEOUT_SECONDS}s"
echo
echo "This probe is successful if llama-server loads the model and begins serving."
echo "A timeout exit after successful startup is acceptable for this test."
echo
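# Optional manual check (a sketch, not executed by this probe): once startup
# messages appear, llama-server's /health endpoint should answer from a
# second shell, e.g.:
#   curl -fsS "http://127.0.0.1:${PORT}/health"
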
timeout "${TIMEOUT_SECONDS}"s docker run --rm --gpus all \
-e CUDA_VISIBLE_DEVICES="${GPU_INDEX}" \
-v "$(dirname "${MODEL_PATH}"):/models:ro" \
"${IMAGE}" \
-m "/models/$(basename "${MODEL_PATH}")" \
-ngl 999 \
--ctx-size "${CTX_SIZE}" \
--host 127.0.0.1 \
--port "${PORT}"