[ PROMPT_NODE_22816 ]

Optimization GGUF 高级用法

[ SKILL_DOCUMENTATION ]

# GGUF 进阶使用指南

## 推测解码 (Speculative Decoding)

### 草稿模型方法

```bash
# 使用较小的模型作为草稿以加快生成速度
# --draft 8：验证前的草稿 token 数
./llama-speculative \
    -m large-model-q4_k_m.gguf \
    -md draft-model-q4_k_m.gguf \
    -p "Write a story about AI" \
    -n 500 \
    --draft 8
```

### 自推测解码 (Self-Speculative Decoding)

```bash
# 使用相同模型但带有不同上下文进行推测
./llama-cli -m model-q4_k_m.gguf \
    --lookup-cache-static lookup.bin \
    --lookup-cache-dynamic lookup-dynamic.bin \
    -p "Hello world"
```

## 批处理推理

### 处理多个提示词

```python
from llama_cpp import Llama

llm = Llama(
    model_path="model-q4_k_m.gguf",
    n_ctx=4096,
    n_gpu_layers=35,
    n_batch=512  # 更大的批次用于并行处理
)

prompts = [
    "What is Python?",
    "Explain machine learning.",
    "Describe neural networks."
]

# 批处理 (每个提示词获得独立的上下文)
for prompt in prompts:
    output = llm(prompt, max_tokens=100)
    print(f"Q: {prompt}")
    print(f"A: {output['choices'][0]['text']}\n")
```

### 服务端批处理

```bash
# 启动带有批处理的服务端
# --parallel 4：并发请求
# --cont-batching：连续批处理
./llama-server \
    -m model-q4_k_m.gguf \
    --host 0.0.0.0 \
    --port 8080 \
    -ngl 35 \
    -c 4096 \
    --parallel 4 \
    --cont-batching
```

## 自定义模型转换

### 带词表修改的转换

```python
# custom_convert.py
import sys
sys.path.insert(0, './llama.cpp')

from convert_hf_to_gguf import main
from gguf import GGUFWriter

# 带自定义词表的转换
def convert_with_custom_vocab(model_path, output_path):
    # 加载并修改分词器
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # 如有需要添加特殊 token
    special_tokens = {"additional_special_tokens": ["<|custom|>"]}
    tokenizer.add_special_tokens(special_tokens)
    tokenizer.save_pretrained(model_path)

    # 然后运行标准转换
    main([model_path, "--outfile", output_path])
```

### 转换特定架构

```bash
# Mistral 风格模型
python convert_hf_to_gguf.py ./mistral-model --outfile mistral-f16.gguf --outtype f16

# Qwen 模型
python convert_hf_to_gguf.py ./qwen-model --outfile qwen-f16.gguf --outtype f16

# Phi 模型
python convert_hf_to_gguf.py ./phi-model --outfile phi-f16.gguf --outtype f16
```

## 进阶

数据来源：claude-code-templates（MIT），中文翻译由 AI 生成。详见关于我们。

BAGUA AI