#!/usr/bin/env bash
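# Launch a llama.cpp llama-server instance pinned to a single GPU
# (defaults target GPU index 1), either natively or in a CUDA container.
# Every setting below can be overridden through the environment.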
set -euo pipefail

# Model file and llama-server binary used for the native path.
MODEL_GPU1="${MODEL_GPU1:-/home/netuser/bin/models/llm/Qwen3.5-9B-Q5_K_M.gguf}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/home/netuser/bin/llama.cpp/build/bin/llama-server}"

# Address and port the server's HTTP API binds to.
HOST="${GPU1_HOST:-127.0.0.1}"
PORT="${GPU1_PORT:-18092}"

# Context window size and number of model layers to offload to the GPU
# (999 offloads all layers).
CTX_SIZE="${GPU1_CTX_SIZE:-4096}"
NGL="${GPU1_NGL:-999}"

# CUDA device index this server is pinned to via CUDA_VISIBLE_DEVICES.
GPU_INDEX="${GPU1_INDEX:-1}"

# Set GPU1_USE_CONTAINER=1 to run the CUDA server container image
# instead of the native binary.
USE_CONTAINER="${GPU1_USE_CONTAINER:-0}"
CONTAINER_IMAGE="${GPU1_CONTAINER_IMAGE:-ghcr.io/ggml-org/llama.cpp:server-cuda}"

# Containerized path: use host networking, restrict the container to the
# chosen GPU, and mount the model's directory read-only.
if [[ "${USE_CONTAINER}" == "1" ]]; then
  exec docker run --rm --gpus all \
    --network host \
    -e CUDA_VISIBLE_DEVICES="${GPU_INDEX}" \
    -v "$(dirname "${MODEL_GPU1}"):/models:ro" \
    "${CONTAINER_IMAGE}" \
    -m "/models/$(basename "${MODEL_GPU1}")" \
    -ngl "${NGL}" \
    --ctx-size "${CTX_SIZE}" \
    --host "${HOST}" \
    --port "${PORT}"
fi

# Native path: run the local binary pinned to the same GPU.
exec env CUDA_VISIBLE_DEVICES="${GPU_INDEX}" "${LLAMA_SERVER_BIN}" \
  -m "${MODEL_GPU1}" \
  -ngl "${NGL}" \
  --ctx-size "${CTX_SIZE}" \
  --host "${HOST}" \
  --port "${PORT}"
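
# Quick smoke test once the server is up (assumes the defaults above;
# the script filename here is illustrative):
#   ./start-gpu1.sh &
#   curl -s http://127.0.0.1:18092/health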