GenieHive/scripts/p40_triple_gpu1.sh

#!/usr/bin/env bash
set -euo pipefail
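# Launch a single llama.cpp server pinned to one GPU (index 1 by default).
# Judging by the filename, this is presumably one of three per-GPU launchers
# in a triple-P40 deployment. Every setting below can be overridden through
# the corresponding GPU1_* environment variable.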
# Model file and locally built server binary.
MODEL_GPU1="${MODEL_GPU1:-/home/netuser/bin/models/llm/Qwen3.5-9B-Q5_K_M.gguf}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/home/netuser/bin/llama.cpp/build/bin/llama-server}"

# Listen address and server tuning.
HOST="${GPU1_HOST:-127.0.0.1}"
PORT="${GPU1_PORT:-18092}"
CTX_SIZE="${GPU1_CTX_SIZE:-4096}"  # context window in tokens
NGL="${GPU1_NGL:-999}"             # GPU layers to offload; 999 effectively means "all layers"
GPU_INDEX="${GPU1_INDEX:-1}"       # CUDA device ordinal this instance is pinned to

# Optional containerized launch via the upstream CUDA server image.
USE_CONTAINER="${GPU1_USE_CONTAINER:-0}"
CONTAINER_IMAGE="${GPU1_CONTAINER_IMAGE:-ghcr.io/ggml-org/llama.cpp:server-cuda}"
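# Container path: the server-cuda image uses llama-server as its entrypoint,
# which is why only server flags are passed below. --network host keeps the
# HOST/PORT bindings meaningful on the host, CUDA_VISIBLE_DEVICES narrows the
# container to the chosen GPU, and the model's directory is mounted read-only
# at /models.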
if [[ "${USE_CONTAINER}" == "1" ]]; then
  exec docker run --rm --gpus all \
    --network host \
    -e CUDA_VISIBLE_DEVICES="${GPU_INDEX}" \
    -v "$(dirname "${MODEL_GPU1}"):/models:ro" \
    "${CONTAINER_IMAGE}" \
    -m "/models/$(basename "${MODEL_GPU1}")" \
    -ngl "${NGL}" \
    --ctx-size "${CTX_SIZE}" \
    --host "${HOST}" \
    --port "${PORT}"
fi
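# Native path: run the local build directly, pinned to the chosen GPU.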
exec env CUDA_VISIBLE_DEVICES="${GPU_INDEX}" "${LLAMA_SERVER_BIN}" \
  -m "${MODEL_GPU1}" \
  -ngl "${NGL}" \
  --ctx-size "${CTX_SIZE}" \
  --host "${HOST}" \
  --port "${PORT}"
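# Example invocations (values are illustrative):
#   ./p40_triple_gpu1.sh                          # native binary on GPU 1
#   GPU1_USE_CONTAINER=1 ./p40_triple_gpu1.sh     # containerized launch
#   GPU1_PORT=19000 GPU1_CTX_SIZE=8192 ./p40_triple_gpu1.sh
#
# llama-server exposes a /health endpoint, handy as a readiness probe
# (default port shown):
#   curl -s http://127.0.0.1:18092/health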