Optimization: HQQ – Advanced Usage
[ SKILL_DOCUMENTATION ]
# HQQ Advanced Usage Guide
## Custom Backend Configuration
### Backend Selection by Hardware
```python
from hqq.core.quantize import HQQLinear
import torch

def select_optimal_backend():
    """Select the best backend based on the GPU's compute capability."""
    device = torch.cuda.get_device_properties(0)
    compute_cap = device.major * 10 + device.minor
    if compute_cap >= 80:    # Ampere or newer
        return "marlin"
    elif compute_cap >= 70:  # Volta/Turing
        return "aten"
    else:
        return "pytorch_compile"

# Note: the accepted backend names depend on the installed hqq version;
# recent releases expose the core backends through the HQQBackend enum.
backend = select_optimal_backend()
HQQLinear.set_backend(backend)
print(f"Using backend: {backend}")
```
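In recent hqq releases, optimized matmul kernels are typically attached to an already-quantized model through a patching helper rather than through `set_backend` strings. The sketch below assumes such a quantized `model` is already in scope; the `prepare_for_inference` helper and the `"torchao_int4"` backend name follow the hqq README and may differ between versions.

```python
# Hedged sketch: enable optimized inference kernels on a quantized model.
# Assumes `model` is already HQQ-quantized; helper and backend names may
# vary across hqq versions.
from hqq.utils.patching import prepare_for_inference

prepare_for_inference(model, backend="torchao_int4", verbose=True)
```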
### Per-Layer Backend Assignment
```python
from hqq.core.quantize import HQQLinear

def set_layer_backends(model):
    """Assign a backend per layer type.

    Assumes the installed hqq build supports per-instance backends; in many
    versions set_backend is a class-level (global) setting instead.
    """
    for name, module in model.named_modules():
        if isinstance(module, HQQLinear):
            if "attn" in name:
                module.set_backend("marlin")   # fast kernels for attention projections
            elif "mlp" in name:
                module.set_backend("bitblas")  # flexible shapes for MLP projections
            else:
                module.set_backend("aten")

set_layer_backends(model)
```
### TorchAO Integration
```python
import torch
import torchao
from hqq.core.quantize import HQQLinear

# Enable the TorchAO int4 backend (the backend name may vary across hqq versions)
HQQLinear.set_backend("torchao_int4")

# Inductor options that typically help the int4 kernels
torch._inductor.config.coordinate_descent_tuning = True
torch._inductor.config.triton.unique_kernel_names = True
```
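The TorchAO int4 path is usually paired with `torch.compile` so Inductor can specialize the dequant/matmul kernels. A minimal sketch, assuming `model` and `tokenizer` are already in scope; the `max-autotune` mode is an illustrative choice, not a requirement.

```python
import torch

# Compile the quantized model; "max-autotune" is illustrative, the default mode also works
model = torch.compile(model, mode="max-autotune")

# Warm-up pass so compilation happens before any timing
with torch.no_grad():
    _ = model(**tokenizer("warm-up", return_tensors="pt").to(model.device))
```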
## Mixed Precision Quantization
### Layer-Specific Configuration
```python
import torch
from hqq.core.quantize import BaseQuantizeConfig, HQQLinear
from transformers import AutoModelForCausalLM

# Define configs per layer-name pattern (None = keep full precision)
quant_configs = {
    # Embeddings and output head: keep full precision
    "embed_tokens": None,
    "lm_head": None,
    # Attention: 4-bit with larger groups
    "self_attn.q_proj": BaseQuantizeConfig(nbits=4, group_size=128),
    "self_attn.k_proj": BaseQuantizeConfig(nbits=4, group_size=128),
    "self_attn.v_proj": BaseQuantizeConfig(nbits=4, group_size=128),
    "self_attn.o_proj": BaseQuantizeConfig(nbits=4, group_size=128),
    # MLP: more aggressive 2-bit and 3-bit
    "mlp.gate_proj": BaseQuantizeConfig(nbits=2, group_size=32),
    "mlp.up_proj": BaseQuantizeConfig(nbits=2, group_size=32),
    "mlp.down_proj": BaseQuantizeConfig(nbits=3, group_size=64),
}
def get_parent_module(model, name):
    """Return the parent module of a dotted module name."""
    parent = model
    for part in name.split(".")[:-1]:
        parent = getattr(parent, part)
    return parent

def quantize_with_mixed_precision(model, configs):
    """Apply mixed-precision quantization based on layer-name patterns."""
    # Collect replacements first so the model is not mutated while iterating
    replacements = []
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            for pattern, config in configs.items():
                if pattern in name:
                    if config is not None:
                        replacements.append((name, module, config))
                    break  # first matching pattern wins; None means skip quantization
    for name, module, config in replacements:
        parent = get_parent_module(model, name)
        setattr(parent, name.split(".")[-1], HQQLinear(module, config))
    return model
```
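A usage sketch for the mixed-precision helper above; the model name is an illustrative choice, and any causal LM with the usual `self_attn`/`mlp` projection names should work the same way.

```python
import torch
from transformers import AutoModelForCausalLM

# Load an unquantized model, then quantize it in place with the pattern configs
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",   # illustrative choice
    torch_dtype=torch.float16,
    device_map="auto",
)
model = quantize_with_mixed_precision(model, quant_configs)
```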
### Sensitivity-Based Quantization
```python
import copy
import torch
from hqq.core.quantize import BaseQuantizeConfig, HQQLinear

def measure_layer_sensitivity(model, calibration_data, layer_name):
    """Measure the quantization sensitivity of a layer at several bit-widths."""
    layer_input, original_output = None, None

    # Capture the layer's input and full-precision output on calibration data
    def hook(module, inputs, output):
        nonlocal layer_input, original_output
        layer_input = inputs[0].detach()
        original_output = output.detach()

    layer = dict(model.named_modules())[layer_name]
    handle = layer.register_forward_hook(hook)
    with torch.no_grad():
        model(calibration_data)
    handle.remove()

    # Quantize a copy of the layer at each precision and measure the output error
    errors = {}
    for nbits in [4, 3, 2]:
        config = BaseQuantizeConfig(nbits=nbits, group_size=64)
        quant_layer = HQQLinear(copy.deepcopy(layer), config)
        with torch.no_grad():
            quantized_output = quant_layer(layer_input)
        errors[nbits] = torch.mean((original_output - quantized_output) ** 2).item()
        print(f"{layer_name} @ {nbits}-bit: MSE = {errors[nbits]:.6f}")
    return errors
# Auto-select precision based on sensitivity
def auto_select_precision(sensitivity_results, threshold=0.01):
    """Select the lowest bit-width whose MSE stays below the threshold."""
    configs = {}
    for layer_name, errors in sensitivity_results.items():
        for nbits, error in sorted(errors.items()):
            if error < threshold:
                configs[layer_name] = BaseQuantizeConfig(nbits=nbits, group_size=64)
                break
    return configs
```
## Performance Benchmarking
### Benchmark Suite
```python
import time
import torch
from dataclasses import dataclass
from typing import Dict

@dataclass
class BenchmarkResult:
    latency_ms: float
    throughput: float
    memory_mb: float
    perplexity: float

def benchmark_hqq_model(model, tokenizer, test_texts) -> BenchmarkResult:
"""Comprehensive benchmark for HQQ models."""
device = next(model.parameters()).device
# Prepare inputs
inputs = tokenizer(test_texts, return_tensors="pt", padding=True).to(device)
# Memory measurement
torch.cuda.reset_peak_memory_stats()
# Latency measurement
torch.cuda.synchronize()
start = time.time()
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=100,
do_sample=False
)
torch.cuda.synchronize()
latency = (time.time() - start) * 1000
# Calculate metrics
total_tokens = outputs.shape[0] * outputs.shape[1]
throughput = total_tokens / (latency / 1000)
memory = torch.cuda.max_memory_allocated() / 1024 / 1024
# Perplexity (simplified)
with torch.no_grad():
outputs = model(**inputs, labels=inputs["input_ids"])
perplexity = torch.exp(outputs.loss).item()
return BenchmarkResult(
latency_ms=latency,
throughput=throughput,
memory_mb=memory,
perplexity=perplexity
)
# Compare different configurations
def compare_quantization_configs(model_name, configs: Dict[str, dict]):
    """Compare different HQQ configurations.

    Assumes `load_hqq_model`, `tokenizer`, and `test_texts` are defined elsewhere.
    """
    results = {}
    for name, config in configs.items():
        print(f"\nBenchmarking: {name}")
        model = load_hqq_model(model_name, **config)
        result = benchmark_hqq_model(model, tokenizer, test_texts)
        results[name] = result
        print(f"  Latency: {result.latency_ms:.1f}ms")
        print(f"  Throughput: {result.throughput:.1f} tok/s")
        print(f"  Memory: {result.memory_mb:.1f}MB")
        print(f"  Perplexity: {result.perplexity:.2f}")
        del model
        torch.cuda.empty_cache()
    return results
```
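A minimal driver sketch for the comparison above, assuming `load_hqq_model` builds an `HqqConfig` from the keyword arguments and that `tokenizer` and `test_texts` are already defined.

```python
# Candidate HQQ settings to compare; the keys are just labels for the report
candidate_configs = {
    "4bit_g64": {"nbits": 4, "group_size": 64},
    "4bit_g128": {"nbits": 4, "group_size": 128},
    "3bit_g64": {"nbits": 3, "group_size": 64},
}

results = compare_quantization_configs("meta-llama/Llama-3.1-8B", candidate_configs)
```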
## Integration Examples
### LangChain Integration
```python
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig, pipeline

# Load HQQ model
config = HqqConfig(nbits=4, group_size=64)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    quantization_config=config,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")

# Create pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256
)

# Wrap for LangChain
llm = HuggingFacePipeline(pipeline=pipe)

# Use in a chain
prompt = PromptTemplate(
    input_variables=["question"],
    template="Answer the question: {question}"
)
chain = LLMChain(llm=llm, prompt=prompt)
result = chain.run("What is machine learning?")
```
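On newer LangChain releases where `LLMChain` is deprecated, the same prompt and pipeline can be composed with the runnable (`|`) syntax instead.

```python
# Equivalent composition using LangChain's runnable interface
chain = prompt | llm
result = chain.invoke({"question": "What is machine learning?"})
```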
### Gradio Interface
```python
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig

# Load model
config = HqqConfig(nbits=4, group_size=64)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    quantization_config=config,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")

def generate(prompt, max_tokens, temperature):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=int(max_tokens),
        temperature=temperature,
        do_sample=temperature > 0
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Textbox(label="Prompt"),
        gr.Slider(10, 500, value=100, label="Max Tokens"),
        gr.Slider(0, 2, value=0.7, label="Temperature")
    ],
    outputs=gr.Textbox(label="Output"),
    title="HQQ Quantized LLM"
)
demo.launch()
```
Source: claude-code-templates (MIT). See About Us for full credits.