Model:
hf download leuconoe/Qwen3-8B-Instruct-GGUF Qwen3-8B-Q5_K_M-Instruct.gguf --local-dir ~/.models/
TurboQuant: cd ~
git clone https://github.com/TheTom/llama-cpp-turboquant.git
cd llama-cpp-turboquant
git checkout feature/turboquant-kv-cache
Build for M4: cmake -B build \
-DGGML_METAL=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DCMAKE_BUILD_TYPE=Release
cmake --build build -j
llama-cli command: ~/llama-cpp-turboquant/build/bin/llama-cli \
-m ~/.models/Qwen3-8B-Q5_K_M-Instruct.gguf \
-ngl 999 \
--cache-type-k turbo3 \
--cache-type-v turbo3 \
--chat-template chatml \
-c 524288 \
-n 624 \
-p "Explain TurboQuant"