[ PROMPT_NODE_22782 ]
Observability LangSmith 高级用法
[ SKILL_DOCUMENTATION ]
# LangSmith 高级用法指南
## 自定义评估器
### 简单的自定义评估器
python
from langsmith import evaluate
def accuracy_evaluator(run, example):
    """Score a run by case-insensitive exact match against the reference.

    Args:
        run: Object whose ``outputs`` is a mapping (or ``None``) holding the
            model's prediction under the ``"answer"`` key.
        example: Object whose ``outputs`` is a mapping (or ``None``) holding
            the reference under the ``"answer"`` key.

    Returns:
        A dict with ``"key"`` set to ``"accuracy"``, ``"score"`` equal to
        1.0 when the two answers match after stripping surrounding
        whitespace and lower-casing (0.0 otherwise), and a ``"comment"``
        showing the first 50 characters of the prediction.
    """
    # ``outputs`` can be None and the stored answer can be None or a
    # non-string; normalise both to text so comparison never raises.
    raw_prediction = (run.outputs or {}).get("answer")
    raw_reference = (example.outputs or {}).get("answer")
    prediction = "" if raw_prediction is None else str(raw_prediction)
    reference = "" if raw_reference is None else str(raw_reference)

    matched = prediction.strip().lower() == reference.strip().lower()
    return {
        "key": "accuracy",
        "score": 1.0 if matched else 0.0,
        "comment": f"Predicted: {prediction[:50]}...",
    }
# Run the evaluation: `my_model` (not defined in this snippet) is passed as
# the target, "test-dataset" as the data, and accuracy_evaluator as the only
# evaluator.
results = evaluate(
my_model,
data="test-dataset",
evaluators=[accuracy_evaluator]
)
### LLM-as-Judge 评估器
python
from langsmith import evaluate
from openai import OpenAI
client = OpenAI()
def llm_judge_evaluator(run, example):
    """Grade a model response with an LLM judge and return a normalized score.

    Builds a grading prompt from the question, the reference answer and the
    model's answer, sends it to the module-level ``client`` (model
    ``"gpt-4o"``), and converts the 1-5 rating in the reply to a score.

    Args:
        run: Object whose ``outputs`` mapping (or ``None``) holds the model
            response under ``"answer"``.
        example: Object whose ``inputs`` mapping holds ``"question"`` and
            whose ``outputs`` mapping holds the reference ``"answer"``;
            either mapping may be ``None``.

    Returns:
        A dict with ``"key"`` set to ``"llm_judge"``, ``"score"`` equal to
        the rating divided by 5 (0.5 when no rating in 1-5 can be read from
        the reply), and ``"comment"`` holding the judge's raw reply.
    """
    # Missing mappings are treated as empty so a run without outputs is
    # still graded instead of raising.
    prediction = (run.outputs or {}).get("answer", "")
    question = (example.inputs or {}).get("question", "")
    reference = (example.outputs or {}).get("answer", "")
    prompt = f"""评估以下响应的准确性和有用性。
问题: {question}
参考答案: {reference}
模型响应: {prediction}
评分范围 1-5:
1 = 完全错误
5 = 完美答案
仅返回数字。"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=10,
    )
    raw_reply = response.choices[0].message.content
    return {
        "key": "llm_judge",
        "score": _parse_judge_score(raw_reply),
        "comment": raw_reply,
    }


def _parse_judge_score(reply):
    """Turn the judge's reply into a score, or 0.5 if no 1-5 rating is found."""
    import re

    # Take the first integer in the reply so answers such as "4.", "4/5" or
    # "Score: 4" are accepted; ``reply`` may be None when no text came back.
    match = re.search(r"-?\d+", reply or "")
    if match is None:
        return 0.5
    rating = int(match.group())
    # Ratings outside the requested 1-5 scale are treated as unparseable
    # rather than producing a score outside the 0.2-1.0 range.
    if not 1 <= rating <= 5:
        return 0.5
    return rating / 5.0
# Run the evaluation: `my_model` (not defined in this snippet) is passed as
# the target, "test-dataset" as the data, and llm_judge_evaluator as the only
# evaluator.
results = evaluate(
my_model,
data="test-dataset",
evaluators=[llm_judge_evaluator]
)
### 异步评估器
python
from langsmith import aevaluate
import asyncio
async def async_evaluator(run, example):
    """Asynchronous evaluator that reports a similarity score.

    Reads the prediction from ``run.outputs["answer"]`` (empty string when
    the key is absent) and the reference from ``example.outputs["answer"]``
    (raises ``KeyError`` when absent), then awaits
    ``compute_similarity_async`` on the pair.

    Returns:
        A dict with ``"key"`` set to ``"similarity"`` and ``"score"`` set to
        the awaited result.
    """
    candidate = run.outputs.get("answer", "")
    expected = example.outputs["answer"]
    # Awaited call (e.g. a remote API request); the helper is defined
    # outside this snippet.
    similarity = await compute_similarity_async(candidate, expected)
    return dict(key="similarity", score=similarity)
async def run_async_eval():
    """Await ``aevaluate`` for ``async_model`` and return its result.

    The call uses the data ``"test-dataset"``, ``async_evaluator`` as the
    only evaluator, and ``max_concurrency=10``.
    """
    eval_options = {
        "data": "test-dataset",
        "evaluators": [async_evaluator],
        "max_concurrency": 10,
    }
    return await aevaluate(async_model, **eval_options)
# Run the coroutine to completion from synchronous code and keep its result.
results = asyncio.run(run_async_eval())
### 多返回值
python
def comprehensive_evaluator(run, example):
"""返回多个评估结果