[ PROMPT_NODE_22781 ]

LangSmith Observability – Advanced Usage

[ SKILL_DOCUMENTATION ]

# LangSmith Advanced Usage Guide ## Custom Evaluators ### Simple Custom Evaluator ```python from langsmith import evaluate def accuracy_evaluator(run, example): """Check if prediction matches reference.""" prediction = run.outputs.get("answer", "") reference = example.outputs.get("answer", "") score = 1.0 if prediction.strip().lower() == reference.strip().lower() else 0.0 return { "key": "accuracy", "score": score, "comment": f"Predicted: {prediction[:50]}..." } results = evaluate( my_model, data="test-dataset", evaluators=[accuracy_evaluator] ) ``` ### LLM-as-Judge Evaluator ```python from langsmith import evaluate from openai import OpenAI client = OpenAI() def llm_judge_evaluator(run, example): """Use LLM to evaluate response quality.""" prediction = run.outputs.get("answer", "") question = example.inputs.get("question", "") reference = example.outputs.get("answer", "") prompt = f"""Evaluate the following response for accuracy and helpfulness. Question: {question} Reference Answer: {reference} Model Response: {prediction} Rate on a scale of 1-5: 1 = Completely wrong 5 = Perfect answer Respond with just the number.""" response = client.chat.completions.create( model="gpt-4o", messages=[{"role": "user", "content": prompt}], max_tokens=10 ) try: score = int(response.choices[0].message.content.strip()) / 5.0 except ValueError: score = 0.5 return { "key": "llm_judge", "score": score, "comment": response.choices[0].message.content } results = evaluate( my_model, data="test-dataset", evaluators=[llm_judge_evaluator] ) ``` ### Async Evaluator ```python from langsmith import aevaluate import asyncio async def async_evaluator(run, example): """Async evaluator for concurrent evaluation.""" prediction = run.outputs.get("answer", "") # Async operation (e.g., API call) score = await compute_similarity_async(prediction, example.outputs["answer"]) return {"key": "similarity", "score": score} async def run_async_eval(): results = await aevaluate( async_model, 
data="test-dataset", evaluators=[async_evaluator], max_concurrency=10 ) return results results = asyncio.run(run_async_eval()) ``` ### Multiple Return Values ```python def comprehensive_evaluator(run, example): """Return multiple evaluation results.""" prediction = run.outputs.get("answer", "") reference = example.outputs.get("answer", "") return [ {"key": "exact_match", "score": 1.0 if prediction == reference else 0.0}, {"key": "length_ratio", "score": min(len(prediction) / max(len(reference), 1), 1.0)}, {"key": "contains_reference", "score": 1.0 if reference.lower() in prediction.lower() else 0.0} ] ``` ## Summary Evaluators ```python def summary_evaluator(runs, examples): """Compute aggregate metrics across all runs.""" total_latency = sum( (run.end_time - run.start_time).total_seconds() for run in runs if run.end_time and run.start_time ) avg_latency = total_latency / len(runs) if runs else 0 return { "key": "avg_latency", "score": avg_latency } results = evaluate( my_model, data="test-dataset", evaluators=[accuracy_evaluator], summary_evaluators=[summary_evaluator] ) ``` ## Comparative Evaluation ```python from langsmith import evaluate_comparative def pairwise_judge(runs, example): """Compare two model outputs.""" output_a = runs[0].outputs.get("answer", "") output_b = runs[1].outputs.get("answer", "") reference = example.outputs.get("answer", "") # Use LLM to compare prompt = f"""Compare these two answers to the question. Question: {example.inputs['question']} Reference: {reference} Answer A: {output_a} Answer B: {output_b} Which is better? 
Respond with 'A', 'B', or 'TIE'.""" response = llm.invoke(prompt) if "A" in response: return {"key": "preference", "scores": {"model_a": 1.0, "model_b": 0.0}} elif "B" in response: return {"key": "preference", "scores": {"model_a": 0.0, "model_b": 1.0}} else: return {"key": "preference", "scores": {"model_a": 0.5, "model_b": 0.5}} results = evaluate_comparative( ["experiment-a-id", "experiment-b-id"], evaluators=[pairwise_judge] ) ``` ## Advanced Tracing ### Run Trees ```python from langsmith import RunTree # Create root run root = RunTree( name="complex_pipeline", run_type="chain", inputs={"query": "What is AI?"}, project_name="my-project" ) # Create child run child = root.create_child( name="retrieval_step", run_type="retriever", inputs={"query": "What is AI?"} ) # Execute and record docs = retriever.invoke("What is AI?") child.end(outputs={"documents": docs}) # Another child llm_child = root.create_child( name="llm_call", run_type="llm", inputs={"prompt": f"Context: {docs}\n\nQuestion: What is AI?"} ) response = llm.invoke(...) 
llm_child.end(outputs={"response": response}) # End root root.end(outputs={"answer": response}) ``` ### Distributed Tracing ```python from langsmith import get_current_run_tree from langsmith.run_helpers import get_tracing_context # Get current trace context context = get_tracing_context() run_tree = get_current_run_tree() # Pass to another service trace_headers = { "langsmith-trace": run_tree.trace_id, "langsmith-parent": run_tree.id } # In receiving service from langsmith import RunTree child_run = RunTree( name="remote_operation", run_type="tool", parent_run_id=headers["langsmith-parent"], trace_id=headers["langsmith-trace"] ) ``` ### Attachments ```python from langsmith import Client client = Client() # Attach files to examples client.create_example( inputs={"query": "Describe this image"}, outputs={"description": "A sunset over mountains"}, attachments={ "image": ("image/jpeg", image_bytes) }, dataset_id=dataset.id ) # Attach to runs from langsmith import traceable @traceable(dangerously_allow_filesystem=True) def process_file(file_path: str): with open(file_path, "rb") as f: return {"result": analyze(f.read())} ``` ## Hub Prompts ### Pull and Use Prompts ```python from langsmith import Client client = Client() # Pull prompt from hub prompt = client.pull_prompt("langchain-ai/rag-prompt") # Use prompt response = prompt.invoke({ "context": "Python is a programming language...", "question": "What is Python?" 
}) ``` ### Push Prompts ```python from langchain_core.prompts import ChatPromptTemplate # Create prompt prompt = ChatPromptTemplate.from_messages([ ("system", "You are a helpful {role}."), ("user", "{question}") ]) # Push to hub client.push_prompt("my-org/my-prompt", object=prompt) # Push with tags client.push_prompt( "my-org/my-prompt", object=prompt, tags=["production", "v2"] ) ``` ### Versioned Prompts ```python # Pull specific version prompt_v1 = client.pull_prompt("my-org/my-prompt", commit_hash="abc123") # Pull latest prompt_latest = client.pull_prompt("my-org/my-prompt") # Compare versions print(f"V1 template: {prompt_v1}") print(f"Latest template: {prompt_latest}") ``` ## Dataset Management ### Create from Runs ```python from langsmith import Client client = Client() # Create dataset from existing runs runs = client.list_runs( project_name="production", filter='and(eq(feedback_key, "user_rating"), gt(feedback_score, 0.8))' ) # Convert to examples examples = [] for run in runs: examples.append({ "inputs": run.inputs, "outputs": run.outputs }) # Create dataset dataset = client.create_dataset("high-quality-examples") client.create_examples( inputs=[e["inputs"] for e in examples], outputs=[e["outputs"] for e in examples], dataset_id=dataset.id ) ``` ### Dataset Splits ```python from langsmith import Client import random client = Client() # Get all examples examples = list(client.list_examples(dataset_name="my-dataset")) random.shuffle(examples) # Split train_size = int(0.8 * len(examples)) train_examples = examples[:train_size] test_examples = examples[train_size:] # Create split datasets train_dataset = client.create_dataset("my-dataset-train") test_dataset = client.create_dataset("my-dataset-test") for ex in train_examples: client.create_example(inputs=ex.inputs, outputs=ex.outputs, dataset_id=train_dataset.id) for ex in test_examples: client.create_example(inputs=ex.inputs, outputs=ex.outputs, dataset_id=test_dataset.id) ``` ### Upload from CSV ```python 
from langsmith import Client client = Client() # Upload CSV directly dataset = client.upload_csv( csv_file="./qa_data.csv", input_keys=["question"], output_keys=["answer"], name="qa-dataset", description="QA pairs from CSV" ) ``` ## Filtering and Querying ### Run Filters ```python from langsmith import Client client = Client() # Complex filters runs = client.list_runs( project_name="production", filter='and(eq(status, "success"), gt(latency, 2.0))', execution_order=1, # Only root runs start_time="2024-01-01T00:00:00Z", end_time="2024-12-31T23:59:59Z" ) # Filter by tags runs = client.list_runs( project_name="production", filter='has(tags, "production")' ) # Filter by error runs = client.list_runs( project_name="production", filter='eq(status, "error")' ) ``` ### Feedback Queries ```python # Get runs with specific feedback runs = client.list_runs( project_name="production", filter='and(eq(feedback_key, "user_rating"), lt(feedback_score, 0.5))' ) # Aggregate feedback from collections import defaultdict feedback_by_key = defaultdict(list) for feedback in client.list_feedback(project_name="production"): feedback_by_key[feedback.key].append(feedback.score) for key, scores in feedback_by_key.items(): print(f"{key}: avg={sum(scores)/len(scores):.2f}, count={len(scores)}") ``` ## OpenTelemetry Integration ```python from opentelemetry import trace from opentelemetry.sdk.trace import TracerProvider from langsmith import Client # Set up OTel provider = TracerProvider() trace.set_tracer_provider(provider) # Create client with OTel integration client = Client(otel_tracer_provider=provider) # Traces will be exported to both LangSmith and OTel backends ``` ## Multi-Tenant Setup ```python from langsmith import Client # Configure multiple endpoints api_urls = { "https://api-team1.langsmith.com": "api_key_1", "https://api-team2.langsmith.com": "api_key_2" } # Client writes to all endpoints client = Client(api_urls=api_urls) # All operations replicated client.create_run( 
name="shared_operation", run_type="chain", inputs={"query": "test"} ) ``` ## Batch Operations ```python from langsmith import Client client = Client() # Batch create examples inputs = [{"q": f"Question {i}"} for i in range(1000)] outputs = [{"a": f"Answer {i}"} for i in range(1000)] client.create_examples( inputs=inputs, outputs=outputs, dataset_id=dataset.id ) # Batch update examples example_ids = [ex.id for ex in client.list_examples(dataset_id=dataset.id)] client.update_examples( example_ids=example_ids, metadata=[{"updated": True} for _ in example_ids] ) # Batch delete client.delete_examples(example_ids=example_ids[:100]) ``` ## Caching and Performance ```python from langsmith import Client from functools import lru_cache client = Client() # Cache dataset lookups @lru_cache(maxsize=100) def get_dataset_id(name: str) -> str: dataset = client.read_dataset(dataset_name=name) return str(dataset.id) # Batch tracing for high throughput client = Client(auto_batch_tracing=True) # Control batch size import os os.environ["LANGSMITH_BATCH_SIZE"] = "100" os.environ["LANGSMITH_BATCH_INTERVAL_MS"] = "1000" ```

Source: claude-code-templates (MIT). See About Us for full credits.

BAGUA AI