# HQQ Advanced Usage Guide
## Custom Backend Configuration
### Selecting a Backend Based on Hardware
```python
from hqq.core.quantize import HQQLinear
import torch

def select_optimal_backend():
    """Select the best backend for the detected GPU."""
    device = torch.cuda.get_device_properties(0)
    compute_cap = device.major * 10 + device.minor
    if compute_cap >= 80:  # Ampere and newer
        return "marlin"
    elif compute_cap >= 70:  # Volta / Turing
        return "aten"
    else:
        return "pytorch_compile"

backend = select_optimal_backend()
HQQLinear.set_backend(backend)
print(f"Using backend: {backend}")
```
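Note that `torch.cuda.get_device_properties(0)` raises on machines without a CUDA device, so it is worth guarding the call. A minimal sketch of such a guard (the CPU fallback mirrors the backend naming used above):

```python
import torch
from hqq.core.quantize import HQQLinear

# Guard against CUDA-less machines before probing device properties
if torch.cuda.is_available():
    HQQLinear.set_backend(select_optimal_backend())
else:
    HQQLinear.set_backend("pytorch_compile")  # CPU-friendly fallback
```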
### Per-Layer Backend Assignment
```python
from hqq.core.quantize import HQQLinear

def set_layer_backends(model):
    """Assign the best backend to each layer type."""
    for name, module in model.named_modules():
        if isinstance(module, HQQLinear):
            if "attn" in name:
                module.set_backend("marlin")   # speed up attention layers
            elif "mlp" in name:
                module.set_backend("bitblas")  # flexible configs for MLP layers
            else:
                module.set_backend("aten")

set_layer_backends(model)
```
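When rolling this out on a new architecture, it helps to first check how many modules each branch actually matches, since layer-name conventions vary between models (some use `attention` rather than `attn`). The sketch below reuses the same substring tests as `set_layer_backends` above:

```python
from collections import Counter
from hqq.core.quantize import HQQLinear

def audit_layer_assignment(model):
    """Count how many HQQLinear modules fall into each backend bucket."""
    buckets = Counter()
    for name, module in model.named_modules():
        if isinstance(module, HQQLinear):
            if "attn" in name:
                buckets["marlin"] += 1
            elif "mlp" in name:
                buckets["bitblas"] += 1
            else:
                buckets["aten"] += 1
    return buckets

print(audit_layer_assignment(model))  # e.g. Counter({'marlin': 128, 'bitblas': 96, ...})
```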
### TorchAO Integration
```python
from hqq.core.quantize import HQQLinear
import torch
import torchao

# Enable the TorchAO int4 backend
HQQLinear.set_backend("torchao_int4")

# Tune TorchAO / inductor compilation options
torch._inductor.config.coordinate_descent_tuning = True
torch._inductor.config.triton.unique_kernel_names = True
```
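These inductor flags only take effect once the model is actually compiled. A minimal follow-up sketch, assuming a quantized `model` already lives on the GPU (the dummy input shape, vocabulary size, and `max-autotune` mode are placeholders):

```python
import torch

# Compile the model so the inductor settings above are picked up
model = torch.compile(model, mode="max-autotune")

# Warm-up call: the first forward pass triggers compilation and autotuning
dummy_input = torch.randint(0, 32000, (1, 16), device="cuda")  # placeholder shape/vocab
with torch.inference_mode():
    _ = model(dummy_input)
```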
## Mixed-Precision Quantization
### Layer-Specific Configuration
```python
import torch
from hqq.core.quantize import BaseQuantizeConfig, HQQLinear
from transformers import AutoModelForCausalLM

# Quantization configs keyed by layer-name pattern
quant_configs = {
    # Embeddings and output head: keep full precision
    "embed_tokens": None,
    "lm_head": None,
    # Attention: 4-bit with larger groups
    "self_attn.q_proj": BaseQuantizeConfig(nbits=4, group_size=128),
    "self_attn.k_proj": BaseQuantizeConfig(nbits=4, group_size=128),
    "self_attn.v_proj": BaseQuantizeConfig(nbits=4, group_size=128),
    "self_attn.o_proj": BaseQuantizeConfig(nbits=4, group_size=128),
    # MLP: more aggressive 2-bit
    "mlp.gate_proj": BaseQuantizeConfig(nbits=2, group_size=32),
    "mlp.up_proj": BaseQuantizeConfig(nbits=2, group_size=32),
    "mlp.down_proj": BaseQuantizeConfig(nbits=3, group_size=64),
}

def quantize_with_mixed_precision(model, configs):
    """Apply mixed-precision quantization, one layer pattern at a time."""
    # Collect replacements first so the module tree is not mutated mid-iteration
    replacements = []
    for name, module in model.named_modules():
        if not isinstance(module, torch.nn.Linear):
            continue
        for pattern, cfg in configs.items():
            if pattern in name:
                if cfg is not None:
                    replacements.append((name, module, cfg))
                break  # first matching pattern wins; None keeps full precision
    for name, module, cfg in replacements:
        parent_name, _, child_name = name.rpartition(".")
        parent = model.get_submodule(parent_name) if parent_name else model
        # HQQLinear quantizes the wrapped layer at construction time
        setattr(parent, child_name, HQQLinear(
            module, quant_config=cfg,
            compute_dtype=torch.float16, device="cuda",
        ))
    return model
```
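Putting the pieces together, a minimal end-to-end sketch (the checkpoint ID below is a placeholder; substitute your own model):

```python
import torch
from transformers import AutoModelForCausalLM

# Placeholder checkpoint; replace with your own model ID
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16
)
quantize_with_mixed_precision(model, quant_configs)
model = model.cuda().eval()
```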