llama.cpp를 모니터링하고 메모리 사용량을 분석하는 스크립트

저의 목표는 항상 일반적인 하드웨어(commodity hardware)로 생산성을 높이는 것이었습니다. 지금까지 저의 주력 모델은 16GB RAM을 가진 단일 9060XT가 장착된 오래된 데스크톱에서 구동되는 gemma 4 및 Qwen 3.6의 MoE 버전들이었습니다.

문제는 항상 모든 자료가 VRAM/RAM 요구 사항에 대해 모호하게 설명한다는 점이었습니다. 모델들은 16비트(16 bits)로 학습되지만, 많은 가이드가 빠르지만 분명히 성능이 제한된 Q4를 제안하는 반면, 대부분의 사람들은 Q6 또는 Q8에서 좋은 결과를 얻는 경향이 있습니다. 하지만 이는 RAM 요구 사항을 예측하기 어렵게 만듭니다. 그래서 저는 llama.cpp의 상세 출력(verbose output)을 파싱하여 검토하기 쉬운 요약본을 제공하는 스크립트를 만들기로 결심했습니다.

첨부된 이미지에서 그 출력을 확인할 수 있습니다. 이 스크립트는 모든 버퍼 할당(buffer allocations)을 읽고, 이를 기능(function) 및 백엔드(backend)별로 그룹화하며, 사용자의 설정에서 어떤 일이 일어나고 있는지 파악하고 그에 따라 계획을 세울 수 있도록 유용한 합계(sums)를 제공합니다.

또한 t/s(tokens per second) 또는 MTP 성능과 같이 누구에게나 유용할 만한, 이해하기 쉬운 몇 가지 통계도 제공합니다.

아래는 실제 스크립트입니다. 이 스크립트는 Linux 환경을 예상하며, 사용자의 llama.cpp 명령어가 상세 출력(-v 플래그)을 포함하는 run.sh라는 스크립트에 들어있다고 가정합니다.

이 스크립트는 ChatGPT를 사용한 바이브 코딩(vibe coding)으로 작성되었으며, 더 우아한 종료(graceful shutdown)를 지원하려면 아직 약간의 작업이 더 필요할 것입니다.

이것이 여러분에게 유용하기를 바랍니다.
#!/usr/bin/env bash
#
# Live monitor for a llama.cpp run: launches the run script with verbose
# output, parses its log into small TSV files, and periodically redraws a
# summary of model info, runtime stats and memory buffer allocations.
#
# All settings below can be overridden from the environment.
set -euo pipefail

# Script that starts llama.cpp.
RUN_SCRIPT="${RUN_SCRIPT:-./run.sh}"
# Combined copy of everything the run script prints.
LOG_FILE="${LOG_FILE:-/tmp/llama-run.log}"
# Parsed buffer allocations, one "<backend>:<type>\t<MiB>" record per line.
MEM_FILE="${MEM_FILE:-/tmp/llama-mem.tsv}"
# Parsed runtime statistics, one "<key>\t<value>" record per line.
STAT_FILE="${STAT_FILE:-/tmp/llama-stats.tsv}"
# Parsed model information, one "<key>\t<value>" record per line.
INFO_FILE="${INFO_FILE:-/tmp/llama-info.tsv}"
# Seconds to sleep between screen refreshes.
INTERVAL="${INTERVAL:-2}"

# Start every run from empty files so results from a previous run are not
# mixed into the new summary.
: > "$LOG_FILE"
: > "$MEM_FILE"
: > "$STAT_FILE"
: > "$INFO_FILE"
#######################################
# Extract a buffer allocation from llama.cpp log text.
# Inputs:  log lines on stdin, e.g.
#            "load_tensors:   CUDA0 model buffer size =  4096.00 MiB"
# Outputs: "<backend>:<type>\t<MiB>" on stdout for every matching line;
#          nothing for lines that do not describe a buffer size.
# Notes:   relies on sed -E and on "\t" in the replacement text being
#          expanded to a tab (GNU sed behaviour).
#######################################
parse_buffer_line() {
  # The amount is right-aligned in the log, so any number of spaces may
  # follow "=", and the line may or may not continue after "MiB".
  sed -nE 's/.* ([A-Za-z0-9_]+)[[:space:]]+([A-Za-z]+) buffer size =[[:space:]]*([0-9.]+) MiB.*/\1:\2\t\3/p'
}
#######################################
# Extract model name / quantisation from llama.cpp log text.
# Inputs:  log lines on stdin.
# Outputs: zero or more "<key>\t<value>" records on stdout, where key is
#          "model_name" or "model_quant".
# Recognised sources:
#   - "llama_model_loader: ... general.name str = <name>"
#   - "llm_load_print_meta: model ftype = <quant>"
#   - JSON text containing "model":"<name>"; when the name contains ":",
#     the text after the last ":" is also reported as the quantisation.
#######################################
parse_info_line() {
  awk '
    /llama_model_loader:/ && /general\.name/ {
      line = $0
      # Drop everything up to and including "general.name str = ".
      sub(/.*general\.name[[:space:]]+str[[:space:]]*=[[:space:]]*/, "", line)
      if (line != "") print "model_name\t" line
    }

    /llm_load_print_meta:/ && /model ftype/ {
      line = $0
      # Drop everything up to and including "model ftype = ".
      sub(/.*model ftype[[:space:]]*=[[:space:]]*/, "", line)
      if (line != "") print "model_quant\t" line
    }

    /"model":/ {
      line = $0
      if (match(line, /"model":"[^"]+"/)) {
        # Skip the 9-character key prefix and the closing quote.
        model = substr(line, RSTART + 9, RLENGTH - 10)
        print "model_name\t" model
        if (match(model, /:([^:]+)$/)) {
          # Text after the last colon, without the colon itself.
          q = substr(model, RSTART + 1, RLENGTH - 1)
          print "model_quant\t" q
        }
      }
    }
  '
}
#######################################
# Extract runtime statistics from llama.cpp log text.
# Inputs:  log lines on stdin.
# Outputs: zero or more "<key>\t<value>" records on stdout. Keys:
#            pp_tps, tg_tps        tokens per second (prompt / generation)
#            n_ctx, ctx_used       values of "n_ctx =" and "n_tokens ="
#            mtp_acceptance        value after "draft acceptance ="
#            mtp_last_accept       "<a>/<b>" from "accepted <a>/<b> draft tokens"
#            mtp_gen_tokens, mtp_acc_tokens, mtp_mean_len
#                                  fields of a "statistics draft-mtp:" line
#######################################
parse_stat_line() {
  awk '
    # Timing lines end in "(<x> ms per token, <y> tokens per second)";
    # keep only <y>.
    /prompt eval time/ && /tokens per second/ {
      line = $0
      sub(/.*\(/, "", line)
      sub(/[[:space:]]*tokens per second.*/, "", line)
      sub(/.*,[[:space:]]*/, "", line)
      print "pp_tps\t" line
    }

    /eval time/ && !/prompt eval time/ && /tokens per second/ {
      line = $0
      sub(/.*\(/, "", line)
      sub(/[[:space:]]*tokens per second.*/, "", line)
      sub(/.*,[[:space:]]*/, "", line)
      print "tg_tps\t" line
    }

    # JSON timing fields.
    /prompt_per_second/ {
      line = $0
      if (match(line, /"prompt_per_second":[0-9.]+/)) {
        v = substr(line, RSTART, RLENGTH)
        sub(/.*:/, "", v)
        print "pp_tps\t" v
      }
    }

    /predicted_per_second/ {
      line = $0
      if (match(line, /"predicted_per_second":[0-9.]+/)) {
        v = substr(line, RSTART, RLENGTH)
        sub(/.*:/, "", v)
        print "tg_tps\t" v
      }
    }

    # Keep only the leading run of digits after the "=" sign.
    /n_ctx[[:space:]]*=/ {
      line = $0
      sub(/.*n_ctx[[:space:]]*=[[:space:]]*/, "", line)
      sub(/[^0-9].*/, "", line)
      if (line != "") print "n_ctx\t" line
    }

    /n_tokens[[:space:]]*=/ {
      line = $0
      sub(/.*n_tokens[[:space:]]*=[[:space:]]*/, "", line)
      sub(/[^0-9].*/, "", line)
      if (line != "") print "ctx_used\t" line
    }

    # Keep the first whitespace-delimited token after the "=" sign.
    /draft acceptance[[:space:]]*=/ {
      line = $0
      sub(/.*draft acceptance[[:space:]]*=[[:space:]]*/, "", line)
      sub(/[[:space:]].*/, "", line)
      print "mtp_acceptance\t" line
    }

    /accepted[[:space:]]+[0-9]+\/[0-9]+ draft tokens/ {
      line = $0
      sub(/.*accepted[[:space:]]+/, "", line)
      sub(/[[:space:]]+draft tokens.*/, "", line)
      print "mtp_last_accept\t" line
    }

    /statistics[[:space:]]+draft-mtp:/ {
      line = $0
      if (match(line, /#gen tokens =[[:space:]]*[0-9]+/)) {
        v = substr(line, RSTART, RLENGTH)
        sub(/.*=[[:space:]]*/, "", v)
        print "mtp_gen_tokens\t" v
      }
      if (match(line, /#acc tokens =[[:space:]]*[0-9]+/)) {
        v = substr(line, RSTART, RLENGTH)
        sub(/.*=[[:space:]]*/, "", v)
        print "mtp_acc_tokens\t" v
      }
      if (match(line, /#mean acc len =[[:space:]]*[0-9.]+/)) {
        v = substr(line, RSTART, RLENGTH)
        sub(/.*=[[:space:]]*/, "", v)
        print "mtp_mean_len\t" v
      }
    }
  '
}

# Start the run script in the background with -v appended to its arguments.
# stdout is only appended to the log; stderr is appended to the log and also
# fed line by line through the three parsers, whose non-empty results are
# appended to the matching TSV file.
"$RUN_SCRIPT" "$@" -v \
  > >(tee -a "$LOG_FILE" >/dev/null) \
  2> >(
    tee -a "$LOG_FILE" |
      while IFS= read -r line; do
        parsed_mem="$(printf '%s\n' "$line" | parse_buffer_line || true)"
        if [[ -n "$parsed_mem" ]]; then
          printf '%s\n' "$parsed_mem" >> "$MEM_FILE"
        fi

        parsed_info="$(printf '%s\n' "$line" | parse_info_line || true)"
        if [[ -n "$parsed_info" ]]; then
          printf '%s\n' "$parsed_info" >> "$INFO_FILE"
        fi

        parsed_stat="$(printf '%s\n' "$line" | parse_stat_line || true)"
        if [[ -n "$parsed_stat" ]]; then
          printf '%s\n' "$parsed_stat" >> "$STAT_FILE"
        fi
      done
  ) &
LLAMA_PID=$!

# Stop the background job whenever this monitor exits or is interrupted.
trap 'kill "$LLAMA_PID" 2>/dev/null || true; exit' INT TERM EXIT

# Redraw the dashboard every INTERVAL seconds while the job is alive.
while kill -0 "$LLAMA_PID" 2>/dev/null; do
  clear
  echo "llama.cpp 모니터링"
  echo "PID: $LLAMA_PID"
  echo "Log: $LOG_FILE"
  echo

  echo "모델 정보"
  echo "----------"
  # NOTE(review): most of this awk program was missing from the text this
  # block was recovered from; only the trailing model_quant fallback
  # survived. The layout and the "Model" / "Quant" labels are reconstructed
  # by analogy with the runtime stats section below - confirm against the
  # original script.
  awk -F '\t' '
    { info[$1] = $2 }
    END {
      printf "%-20s %s\n", "Model", (info["model_name"] ? info["model_name"] : "-")
      printf "%-20s %s\n", "Quant", (info["model_quant"] ? info["model_quant"] : "-")
    }
  ' "$INFO_FILE"
  echo

  echo "Runtime stats"
  echo "-------------"
  # Later records overwrite earlier ones, so the latest value per key wins.
  awk -F '\t' '
    { stat[$1] = $2 }
    END {
      printf "%-20s %s\n", "Prompt eval t/s", (stat["pp_tps"] ? stat["pp_tps"] : "-")
      printf "%-20s %s\n", "Token gen t/s", (stat["tg_tps"] ? stat["tg_tps"] : "-")
      printf "%-20s %s\n", "Context used", (stat["ctx_used"] ? stat["ctx_used"] : "-")
      printf "%-20s %s\n", "Context size", (stat["n_ctx"] ? stat["n_ctx"] : "-")
      printf "%-20s %s\n", "MTP acceptance", (stat["mtp_acceptance"] ? stat["mtp_acceptance"] : "-")
      printf "%-20s %s\n", "MTP accepted", (stat["mtp_acc_tokens"] && stat["mtp_gen_tokens"] ? stat["mtp_acc_tokens"] "/" stat["mtp_gen_tokens"] : "-")
      printf "%-20s %s\n", "MTP mean len", (stat["mtp_mean_len"] ? stat["mtp_mean_len"] : "-")
      printf "%-20s %s\n", "MTP last accept", (stat["mtp_last_accept"] ? stat["mtp_last_accept"] : "-")
    }
  ' "$STAT_FILE"
  echo

  echo "Memory buffers"
  echo "--------------"
  if [[ ! -s "$MEM_FILE" ]]; then
    echo "Waiting for buffer allocation lines..."
  else
    # asorti() is a gawk extension, so "awk" must resolve to GNU awk here.
    # Only the most recent size reported for each "<backend>:<type>" key is
    # counted; sizes are then summed per backend, per type, and overall.
    awk -F '\t' '
      { latest[$1] = $2 }
      END {
        grand = 0
        n = asorti(latest, keys)

        printf "%-40s %12s\n", "Buffer", "MiB"
        printf "%-40s %12s\n", "------", "---"
        for (i = 1; i <= n; i++) {
          key = keys[i]
          mib = latest[key] + 0
          split(key, parts, ":")
          backend = parts[1]
          type = parts[2]
          backend_total[backend] += mib
          type_total[type] += mib
          grand += mib
          printf "%-40s %12.2f\n", key, mib
        }

        printf "\n"
        printf "%-40s %12s\n", "Backend totals", "MiB"
        printf "%-40s %12s\n", "--------------", "---"
        m = asorti(backend_total, backend_keys)
        for (i = 1; i <= m; i++) {
          backend = backend_keys[i]
          printf "%-40s %12.2f\n", backend, backend_total[backend]
        }

        printf "\n"
        printf "%-40s %12s\n", "Allocation totals", "MiB"
        printf "%-40s %12s\n", "-----------------", "---"
        t = asorti(type_total, type_keys)
        for (i = 1; i <= t; i++) {
          type = type_keys[i]
          printf "%-40s %12.2f\n", type, type_total[type]
        }

        printf "\n"
        printf "%-40s %12.2f MiB\n", "Grand total explicit", grand
        printf "%-40s %12.2f GiB\n", "Grand total explicit", grand / 1024
      }
    ' "$MEM_FILE"
  fi

  sleep "$INTERVAL"
done

# Reap the background job once it is no longer running.
wait "$LLAMA_PID"
submitted by /u/j0hnp0s
[link] [comments]

llama.cpp를 모니터링하고 메모리 사용량을 분석하는 스크립트

요약

핵심 포인트

댓글