[ PROMPT_NODE_22781 ]
Observability: LangSmith – Advanced Usage
[ SKILL_DOCUMENTATION ]
# LangSmith Advanced Usage Guide
## Custom Evaluators
### Simple Custom Evaluator
```python
from langsmith import evaluate
def accuracy_evaluator(run, example):
    """Check whether the prediction matches the reference answer.

    The comparison ignores surrounding whitespace and letter case.

    Args:
        run: Object whose ``outputs`` is a mapping (or ``None``) that may
            contain an ``"answer"`` entry with the model prediction.
        example: Object whose ``outputs`` is a mapping (or ``None``) that may
            contain an ``"answer"`` entry with the reference.

    Returns:
        A dict with ``"key"`` (``"accuracy"``), ``"score"`` (1.0 on a match,
        otherwise 0.0) and a ``"comment"`` showing the first 50 characters of
        the prediction.
    """
    # ``outputs`` may be None, and the answer may be missing or not a string;
    # normalise both sides to text before comparing.
    raw_prediction = (run.outputs or {}).get("answer")
    raw_reference = (example.outputs or {}).get("answer")
    prediction = "" if raw_prediction is None else str(raw_prediction)
    reference = "" if raw_reference is None else str(raw_reference)

    matches = prediction.strip().lower() == reference.strip().lower()
    return {
        "key": "accuracy",
        "score": 1.0 if matches else 0.0,
        "comment": f"Predicted: {prediction[:50]}...",
    }
# Evaluate my_model against "test-dataset" using the accuracy evaluator.
accuracy_checks = [accuracy_evaluator]
results = evaluate(my_model, data="test-dataset", evaluators=accuracy_checks)
```
### LLM-as-Judge Evaluator
```python
from langsmith import evaluate
from openai import OpenAI
client = OpenAI()
def llm_judge_evaluator(run, example):
    """Use an LLM to rate response quality on a 1-5 scale.

    Args:
        run: Object whose ``outputs`` mapping may hold the model ``"answer"``.
        example: Object whose ``inputs`` mapping may hold the ``"question"``
            and whose ``outputs`` mapping may hold the reference ``"answer"``.

    Returns:
        A dict with ``"key"`` (``"llm_judge"``), ``"score"`` (the rating
        divided by 5, or 0.5 when no rating between 1 and 5 can be read from
        the reply) and ``"comment"`` (the judge's raw reply).
    """
    import re  # local import keeps this snippet self-contained

    prediction = (run.outputs or {}).get("answer", "")
    question = (example.inputs or {}).get("question", "")
    reference = (example.outputs or {}).get("answer", "")
    prompt = f"""Evaluate the following response for accuracy and helpfulness.
Question: {question}
Reference Answer: {reference}
Model Response: {prediction}
Rate on a scale of 1-5:
1 = Completely wrong
5 = Perfect answer
Respond with just the number."""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=10,
    )
    raw_reply = response.choices[0].message.content
    # The reply may be None or decorated ("4.", "Rating: 4"). Accept the first
    # standalone digit in 1-5 so the score always stays within 0.2-1.0, and
    # reject values such as "0" or "10".
    rating = re.search(r"(?<!\d)[1-5](?!\d)", raw_reply or "")
    score = int(rating.group()) / 5.0 if rating else 0.5
    return {
        "key": "llm_judge",
        "score": score,
        "comment": raw_reply,
    }
# Evaluate my_model against "test-dataset" using the LLM judge.
judge_checks = [llm_judge_evaluator]
results = evaluate(my_model, data="test-dataset", evaluators=judge_checks)
```
### Async Evaluator
```python
from langsmith import aevaluate
import asyncio
async def async_evaluator(run, example):
    """Score a run by awaiting an asynchronous similarity computation.

    The prediction falls back to "" when the run has no "answer" output; the
    reference is read with ``example.outputs["answer"]`` and must be present.

    Returns:
        A dict with ``"key"`` (``"similarity"``) and the awaited ``"score"``.
    """
    predicted = run.outputs.get("answer", "")
    expected = example.outputs["answer"]
    # Async operation (e.g., API call)
    similarity = await compute_similarity_async(predicted, expected)
    return {"key": "similarity", "score": similarity}
async def run_async_eval():
    """Await ``aevaluate`` for ``async_model`` on "test-dataset" and return its result."""
    return await aevaluate(
        async_model,
        data="test-dataset",
        evaluators=[async_evaluator],
        max_concurrency=10,
    )


# Drive the coroutine to completion from synchronous code.
results = asyncio.run(run_async_eval())
```
### Multiple Return Values
```python
def comprehensive_evaluator(run, example):
    """Return multiple evaluation results for one run.

    Args:
        run: Object whose ``outputs`` mapping (or ``None``) may hold ``"answer"``.
        example: Object whose ``outputs`` mapping (or ``None``) may hold the
            reference ``"answer"``.

    Returns:
        A list of three result dicts:
        ``exact_match`` (1.0 when the strings are equal),
        ``length_ratio`` (prediction length over reference length, capped at 1.0),
        ``contains_reference`` (1.0 when a non-empty reference occurs in the
        prediction, ignoring case).
    """
    prediction = (run.outputs or {}).get("answer", "")
    reference = (example.outputs or {}).get("answer", "")
    # "" is a substring of every string, so an empty reference must not be
    # reported as "contained".
    contains_reference = bool(reference) and reference.lower() in prediction.lower()
    return [
        {"key": "exact_match", "score": 1.0 if prediction == reference else 0.0},
        {"key": "length_ratio", "score": min(len(prediction) / max(len(reference), 1), 1.0)},
        {"key": "contains_reference", "score": 1.0 if contains_reference else 0.0},
    ]
```
## Summary Evaluators
```python
def summary_evaluator(runs, examples):
    """Compute the average latency across runs that have timing data.

    Args:
        runs: Iterable of objects with ``start_time`` and ``end_time``
            attributes (datetimes, or a falsy value when unknown).
        examples: Unused here; accepted because summary evaluators receive it.

    Returns:
        A dict with ``"key"`` (``"avg_latency"``) and ``"score"``: the mean
        duration in seconds of the timed runs, or 0 when none are timed.
    """
    durations = [
        (run.end_time - run.start_time).total_seconds()
        for run in runs
        if run.end_time and run.start_time
    ]
    # Divide by the number of runs that contributed a duration; dividing by
    # every run would understate the average whenever some runs lack timestamps.
    avg_latency = sum(durations) / len(durations) if durations else 0
    return {
        "key": "avg_latency",
        "score": avg_latency,
    }
# Evaluate my_model with a per-run evaluator and the summary evaluator.
per_run_checks = [accuracy_evaluator]
aggregate_checks = [summary_evaluator]
results = evaluate(
    my_model,
    data="test-dataset",
    evaluators=per_run_checks,
    summary_evaluators=aggregate_checks,
)
```
## Comparative Evaluation
```python
from langsmith import evaluate_comparative
def pairwise_judge(runs, example):
    """Compare two model outputs with an LLM and report a preference.

    Args:
        runs: Sequence of two run objects, each with an ``id`` and an
            ``outputs`` mapping that may hold ``"answer"``.
        example: Object whose ``inputs`` holds ``"question"`` and whose
            ``outputs`` mapping may hold the reference ``"answer"``.

    Returns:
        A dict with ``"key"`` (``"preference"``) and ``"scores"`` mapping each
        run's ID to its score: 1.0/0.0 for a winner, 0.5/0.5 for a tie or an
        unreadable verdict.
    """
    import re  # local import keeps this snippet self-contained

    output_a = (runs[0].outputs or {}).get("answer", "")
    output_b = (runs[1].outputs or {}).get("answer", "")
    reference = (example.outputs or {}).get("answer", "")
    # Use LLM to compare
    prompt = f"""Compare these two answers to the question.
Question: {example.inputs['question']}
Reference: {reference}
Answer A: {output_a}
Answer B: {output_b}
Which is better? Respond with 'A', 'B', or 'TIE'."""
    response = llm.invoke(prompt)
    # ``llm.invoke`` may return a message object rather than a plain string.
    reply_text = str(getattr(response, "content", response))
    # Take the first standalone verdict token. A plain substring test would
    # read "B is better than A" (or any reply containing a capital A) as "A".
    verdict = re.search(r"\b(A|B|TIE)\b", reply_text)
    choice = verdict.group(1) if verdict else "TIE"
    if choice == "A":
        score_a, score_b = 1.0, 0.0
    elif choice == "B":
        score_a, score_b = 0.0, 1.0
    else:
        score_a, score_b = 0.5, 0.5
    # Comparative evaluators report scores keyed by run ID.
    return {
        "key": "preference",
        "scores": {runs[0].id: score_a, runs[1].id: score_b},
    }
# Compare the two experiments with the pairwise judge.
experiment_ids = ["experiment-a-id", "experiment-b-id"]
results = evaluate_comparative(experiment_ids, evaluators=[pairwise_judge])
```
## Advanced Tracing
### Run Trees
```python
from langsmith import RunTree
# Create root run and send it to LangSmith
root = RunTree(
    name="complex_pipeline",
    run_type="chain",
    inputs={"query": "What is AI?"},
    project_name="my-project",
)
root.post()

# Create child run
child = root.create_child(
    name="retrieval_step",
    run_type="retriever",
    inputs={"query": "What is AI?"},
)
child.post()
# Execute and record; patch() uploads the recorded outputs
docs = retriever.invoke("What is AI?")
child.end(outputs={"documents": docs})
child.patch()

# Another child
llm_child = root.create_child(
    name="llm_call",
    run_type="llm",
    inputs={"prompt": f"Context: {docs}\n\nQuestion: What is AI?"},
)
llm_child.post()
response = llm.invoke(...)
llm_child.end(outputs={"response": response})
llm_child.patch()

# End root
root.end(outputs={"answer": response})
root.patch()
```
### Distributed Tracing
```python
from langsmith import get_current_run_tree
from langsmith.run_helpers import get_tracing_context
# Get current trace context
context = get_tracing_context()
run_tree = get_current_run_tree()  # None when called outside a traced run

# Pass to another service: to_headers() serialises the trace context into
# string header values that can be attached to the outgoing request.
trace_headers = run_tree.to_headers() if run_tree is not None else {}

# In receiving service (``headers`` are the incoming request headers)
from langsmith import RunTree

parent_run = RunTree.from_headers(headers)
if parent_run is not None:
    # Continue the caller's trace as a child of the remote parent run.
    child_run = parent_run.create_child(
        name="remote_operation",
        run_type="tool",
    )
else:
    # No trace context was propagated; start a standalone run instead.
    child_run = RunTree(
        name="remote_operation",
        run_type="tool",
    )
```
### Attachments
```python
from langsmith import Client
client = Client()

# Attach files to examples: each attachment is a (mime type, bytes) pair
image_attachment = ("image/jpeg", image_bytes)
client.create_example(
    inputs={"query": "Describe this image"},
    outputs={"description": "A sunset over mountains"},
    attachments={"image": image_attachment},
    dataset_id=dataset.id,
)
# Attach to runs
from langsmith import traceable
@traceable(dangerously_allow_filesystem=True)
def process_file(file_path: str):
    """Read *file_path* as bytes, analyze the content and return it under "result"."""
    with open(file_path, "rb") as handle:
        analysis = analyze(handle.read())
        return {"result": analysis}
```
## Hub Prompts
### Pull and Use Prompts
```python
from langsmith import Client
client = Client()

# Pull prompt from hub
prompt = client.pull_prompt("langchain-ai/rag-prompt")

# Use prompt: invoke it with a value for each input variable
prompt_inputs = {
    "context": "Python is a programming language...",
    "question": "What is Python?",
}
response = prompt.invoke(prompt_inputs)
```
### Push Prompts
```python
from langchain_core.prompts import ChatPromptTemplate
# Create prompt
prompt_messages = [
    ("system", "You are a helpful {role}."),
    ("user", "{question}"),
]
prompt = ChatPromptTemplate.from_messages(prompt_messages)

# Push to hub
prompt_name = "my-org/my-prompt"
client.push_prompt(prompt_name, object=prompt)

# Push with tags
client.push_prompt(prompt_name, object=prompt, tags=["production", "v2"])
```
### Versioned Prompts
```python
# Pull specific version: the commit hash is appended to the prompt identifier
# ("owner/name:commit"); pull_prompt has no separate commit_hash argument.
prompt_v1 = client.pull_prompt("my-org/my-prompt:abc123")
# Pull latest
prompt_latest = client.pull_prompt("my-org/my-prompt")
# Compare versions
print(f"V1 template: {prompt_v1}")
print(f"Latest template: {prompt_latest}")
```
## Dataset Management
### Create from Runs
```python
from langsmith import Client
client = Client()

# Create dataset from existing runs
rating_filter = 'and(eq(feedback_key, "user_rating"), gt(feedback_score, 0.8))'
runs = client.list_runs(project_name="production", filter=rating_filter)

# Convert to examples
examples = [{"inputs": run.inputs, "outputs": run.outputs} for run in runs]

# Create dataset
dataset = client.create_dataset("high-quality-examples")
example_inputs = [entry["inputs"] for entry in examples]
example_outputs = [entry["outputs"] for entry in examples]
client.create_examples(
    inputs=example_inputs,
    outputs=example_outputs,
    dataset_id=dataset.id,
)
```
### Dataset Splits
```python
from langsmith import Client
import random
client = Client()

# Get all examples
examples = list(client.list_examples(dataset_name="my-dataset"))
random.shuffle(examples)

# Split 80/20
train_size = int(0.8 * len(examples))
train_examples = examples[:train_size]
test_examples = examples[train_size:]

# Create split datasets
train_dataset = client.create_dataset("my-dataset-train")
test_dataset = client.create_dataset("my-dataset-test")

# Upload each split with one batched call instead of one request per example.
for split_dataset, split_examples in (
    (train_dataset, train_examples),
    (test_dataset, test_examples),
):
    if not split_examples:
        continue  # nothing to upload for an empty split
    client.create_examples(
        inputs=[ex.inputs for ex in split_examples],
        outputs=[ex.outputs for ex in split_examples],
        dataset_id=split_dataset.id,
    )
```
### Upload from CSV
```python
from langsmith import Client
client = Client()

# Upload CSV directly
question_columns = ["question"]
answer_columns = ["answer"]
dataset = client.upload_csv(
    csv_file="./qa_data.csv",
    input_keys=question_columns,
    output_keys=answer_columns,
    name="qa-dataset",
    description="QA pairs from CSV",
)
```
## Filtering and Querying
### Run Filters
```python
from langsmith import Client
from datetime import datetime, timezone

client = Client()

# Complex filters
runs = client.list_runs(
    project_name="production",
    filter='and(eq(status, "success"), gt(latency, 2.0))',
    is_root=True,  # Only root runs
    # start_time is passed as a timezone-aware datetime rather than a string.
    start_time=datetime(2024, 1, 1, tzinfo=timezone.utc),
    # NOTE(review): end_time is not a named list_runs parameter in the SDK
    # versions checked; it is presumably forwarded as an extra query field.
    # Confirm against the installed version.
    end_time="2024-12-31T23:59:59Z",
)

# Filter by tags
runs = client.list_runs(
    project_name="production",
    filter='has(tags, "production")',
)

# Filter by error
runs = client.list_runs(
    project_name="production",
    filter='eq(status, "error")',
)
```
### Feedback Queries
```python
# Get runs with specific feedback
runs = client.list_runs(
    project_name="production",
    filter='and(eq(feedback_key, "user_rating"), lt(feedback_score, 0.5))',
)

# Aggregate feedback
from collections import defaultdict

feedback_by_key = defaultdict(list)
for feedback in client.list_feedback(project_name="production"):
    # Feedback without a numeric score (e.g. comment-only) has score None and
    # would break the sum below, so only scored entries are collected.
    if feedback.score is not None:
        feedback_by_key[feedback.key].append(feedback.score)

for key, scores in feedback_by_key.items():
    print(f"{key}: avg={sum(scores)/len(scores):.2f}, count={len(scores)}")
```
## OpenTelemetry Integration
```python
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from langsmith import Client
# Set up OTel
# Set up OTel: register one provider as the global tracer provider
provider = TracerProvider()
trace.set_tracer_provider(provider)

# Create client with OTel integration, sharing that same provider
client_options = {"otel_tracer_provider": provider}
client = Client(**client_options)
# Traces will be exported to both LangSmith and OTel backends
```
## Multi-Tenant Setup
```python
from langsmith import Client
# Configure multiple endpoints
# Map each endpoint URL to the API key used for it
api_urls = {}
api_urls["https://api-team1.langsmith.com"] = "api_key_1"
api_urls["https://api-team2.langsmith.com"] = "api_key_2"

# Client writes to all endpoints
client = Client(api_urls=api_urls)

# All operations replicated
run_fields = {
    "name": "shared_operation",
    "run_type": "chain",
    "inputs": {"query": "test"},
}
client.create_run(**run_fields)
```
## Batch Operations
```python
from langsmith import Client
client = Client()

# Batch create examples
example_count = 1000
inputs = [{"q": f"Question {i}"} for i in range(example_count)]
outputs = [{"a": f"Answer {i}"} for i in range(example_count)]
client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)

# Batch update examples: one metadata dict per example ID
example_ids = [ex.id for ex in client.list_examples(dataset_id=dataset.id)]
metadata_updates = [{"updated": True} for _ in example_ids]
client.update_examples(example_ids=example_ids, metadata=metadata_updates)

# Batch delete the first 100 examples
client.delete_examples(example_ids=example_ids[:100])
```
## Caching and Performance
```python
from langsmith import Client
from functools import lru_cache
client = Client()


# Cache dataset lookups
@lru_cache(maxsize=100)
def get_dataset_id(name: str) -> str:
    """Return the ID of the dataset called *name* as a string.

    Results are memoised per name (up to 100 entries), so repeated lookups of
    the same name reuse the first answer instead of calling the client again.
    """
    return str(client.read_dataset(dataset_name=name).id)
# Batch tracing for high throughput
client = Client(auto_batch_tracing=True)

# Control batch size
import os

os.environ.update(
    {
        "LANGSMITH_BATCH_SIZE": "100",
        "LANGSMITH_BATCH_INTERVAL_MS": "1000",
    }
)
```
Source: claude-code-templates (MIT). See About Us for full credits.