[ PROMPT_NODE_22782 ]

Observability Langsmith 高级用法

[ SKILL_DOCUMENTATION ]

# LangSmith 高级用法指南

## 自定义评估器

### 简单的自定义评估器

```python
from langsmith import evaluate

def accuracy_evaluator(run, example):
    """检查预测是否与参考答案匹配。"""
    prediction = run.outputs.get("answer", "")
    reference = example.outputs.get("answer", "")

    score = 1.0 if prediction.strip().lower() == reference.strip().lower() else 0.0

    return {
        "key": "accuracy",
        "score": score,
        "comment": f"Predicted: {prediction[:50]}..."
    }

results = evaluate(
    my_model,
    data="test-dataset",
    evaluators=[accuracy_evaluator]
)
```

### LLM-as-Judge 评估器

```python
from langsmith import evaluate
from openai import OpenAI

client = OpenAI()

def llm_judge_evaluator(run, example):
    """使用 LLM 评估响应质量。"""
    prediction = run.outputs.get("answer", "")
    question = example.inputs.get("question", "")
    reference = example.outputs.get("answer", "")

    prompt = f"""评估以下响应的准确性和有用性。

问题: {question}
参考答案: {reference}
模型响应: {prediction}

评分范围 1-5:
1 = 完全错误
5 = 完美答案

仅返回数字。"""

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=10
    )

    try:
        score = int(response.choices[0].message.content.strip()) / 5.0
    except ValueError:
        score = 0.5

    return {
        "key": "llm_judge",
        "score": score,
        "comment": response.choices[0].message.content
    }

results = evaluate(
    my_model,
    data="test-dataset",
    evaluators=[llm_judge_evaluator]
)
```

### 异步评估器

```python
from langsmith import aevaluate
import asyncio

async def async_evaluator(run, example):
    """用于并发评估的异步评估器。"""
    prediction = run.outputs.get("answer", "")

    # 异步操作 (例如 API 调用)
    score = await compute_similarity_async(prediction, example.outputs["answer"])

    return {"key": "similarity", "score": score}

async def run_async_eval():
    results = await aevaluate(
        async_model,
        data="test-dataset",
        evaluators=[async_evaluator],
        max_concurrency=10
    )
    return results

results = asyncio.run(run_async_eval())
```

### 多返回值

```python
def comprehensive_evaluator(run, example):
    """返回多个评估结果
```

数据来源：claude-code-templates（MIT），中文翻译由 AI 生成。详见关于我们。

BAGUA AI