Basic Usage

This guide walks you through running your first evaluation with the TurnWise SDK.

Step 1: Initialize the Client

import asyncio
from turnwise import TurnWiseClient

async def main():
    client = TurnWiseClient(
        turnwise_api_key="tw_your_key",
        openrouter_api_key="sk-or-your_key"
    )
    
    # Verify connection
    await client.verify()
    print("Connected!")
    
    await client.close()

asyncio.run(main())
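
If verification fails, the snippet above never reaches client.close(). A minimal variant (a sketch using only the calls already shown, and assuming verify() raises on failure) wraps the work in try/finally so the connection is always released; the remaining steps assume the client stays open until you are done:
import asyncio
from turnwise import TurnWiseClient

async def main():
    client = TurnWiseClient(
        turnwise_api_key="tw_your_key",
        openrouter_api_key="sk-or-your_key"
    )
    try:
        # Assumes verify() raises on an invalid key; adjust if the SDK
        # signals failure differently.
        await client.verify()
        print("Connected!")
        # ... run evaluations here ...
    finally:
        await client.close()

asyncio.run(main())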

Step 2: List Your Datasets

datasets = await client.list_datasets()
for ds in datasets:
    print(f"{ds.id}: {ds.name} ({ds.conversation_count} conversations)")

Step 3: Define a Metric

Create a metric to evaluate your conversations:
from turnwise import Metric, EvaluationLevel, OutputType

metric = Metric(
    name="Response Helpfulness",
    description="Measures how helpful the assistant's response is",
    prompt="""Evaluate how helpful this assistant response is.

USER GOAL: @GOAL
RESPONSE TO EVALUATE: @CURRENT_MESSAGE.output

Consider:
- Does it address the user's need?
- Is the information accurate?
- Is it clear and actionable?

Score from 0.0 to 1.0.""",
    evaluation_level=EvaluationLevel.MESSAGE,
    output_type=OutputType.PROGRESS,
    model_name="openai/gpt-4o-mini",
)

Step 4: Run Evaluation

Run the evaluation on a dataset:
results = await client.evaluate(
    dataset_id=1,
    metric=metric,
    progress_callback=lambda p: print(f"Progress: {p.completed}/{p.total}")
)
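
The same call works for several metrics; a sketch that loops a set of metrics over one dataset (the metric variables are placeholders for Metric objects defined as in Step 3):
# Placeholders: helpfulness_metric and accuracy_metric are Metric
# objects defined as in Step 3.
metrics = {
    "helpfulness": helpfulness_metric,
    "accuracy": accuracy_metric,
}
all_results = {}
for label, m in metrics.items():
    all_results[label] = await client.evaluate(dataset_id=1, metric=m)
    print(f"{label}: {len(all_results[label])} results")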

Step 5: View Results

Access evaluation results:
# Get scores
scores = [r.get_score() for r in results if r.get_score() is not None]
if scores:
    avg_score = sum(scores) / len(scores)
    print(f"Average score: {avg_score:.3f}")

# View individual results
for result in results[:5]:
    print(f"Entity {result.entity_id}: {result.get_score()}")

Complete Example

import asyncio
from turnwise import TurnWiseClient, Metric, EvaluationLevel, OutputType

async def main():
    # Initialize client
    client = TurnWiseClient(
        turnwise_api_key="tw_your_key",
        openrouter_api_key="sk-or-your_key"
    )
    
    await client.verify()
    
    # List datasets
    datasets = await client.list_datasets()
    if not datasets:
        print("No datasets found")
        return
    
    dataset_id = datasets[0].id
    print(f"Using dataset: {datasets[0].name}")
    
    # Define metric
    metric = Metric(
        name="Helpfulness",
        prompt="""Rate how helpful this response is.

GOAL: @GOAL
RESPONSE: @CURRENT_MESSAGE.output

Score 0-1.""",
        evaluation_level=EvaluationLevel.MESSAGE,
        output_type=OutputType.PROGRESS,
    )
    
    # Run evaluation
    def on_progress(progress):
        print(f"Progress: {progress.completed}/{progress.total}")
    
    results = await client.evaluate(
        dataset_id=dataset_id,
        metric=metric,
        progress_callback=on_progress
    )
    
    # Calculate statistics
    scores = [r.get_score() for r in results if r.get_score() is not None]
    if scores:
        print(f"\nAverage: {sum(scores) / len(scores):.3f}")
        print(f"Min: {min(scores):.3f}")
        print(f"Max: {max(scores):.3f}")
    
    print(f"\nResults synced to TurnWise!")
    print(f"View at: https://app.turnwise.io/datasets/{dataset_id}")
    
    await client.close()

asyncio.run(main())

Evaluation Levels

Choose the right level for your metric:

Conversation Level

Evaluate the entire conversation:
metric = Metric(
    name="Overall Quality",
    prompt="Evaluate the overall quality of this conversation: @HISTORY",
    evaluation_level=EvaluationLevel.CONVERSATION,
    output_type=OutputType.PROGRESS,
)

Message Level

Evaluate each assistant message:
metric = Metric(
    name="Message Helpfulness",
    prompt="Evaluate this response: @CURRENT_MESSAGE.output",
    evaluation_level=EvaluationLevel.MESSAGE,
    output_type=OutputType.PROGRESS,
)

Step Level

Evaluate individual reasoning steps:
metric = Metric(
    name="Tool Usage Quality",
    prompt="Evaluate this tool call: @CURRENT_STEP.tool_call",
    evaluation_level=EvaluationLevel.STEP,
    output_type=OutputType.PROGRESS,
)

Output Types

Progress (0-1 Score)

The most common choice for quality metrics:
metric = Metric(
    name="Helpfulness",
    prompt="Score 0-1...",
    output_type=OutputType.PROGRESS,
)

Checkbox (Boolean)

For yes/no evaluations:
metric = Metric(
    name="Is Helpful?",
    prompt="Is this response helpful? (yes/no)",
    output_type=OutputType.CHECKBOX,
)
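
A pass rate is the natural summary for checkbox metrics. The sketch below assumes get_score() returns a boolean-like value (True or 1.0 for "yes") on checkbox results; confirm against your own result objects:
# Summarize checkbox results as a pass rate. Assumption: get_score()
# returns a boolean-like value (True / 1.0 means "yes") for checkbox
# metrics.
answers = [r.get_score() for r in results if r.get_score() is not None]
if answers:
    pass_rate = sum(1 for a in answers if a) / len(answers)
    print(f"Pass rate: {pass_rate:.1%}")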

Text

For free-form analysis:
metric = Metric(
    name="Analysis",
    prompt="Analyze this response...",
    output_type=OutputType.TEXT,
)

JSON

For structured multi-dimensional evaluation:
metric = Metric(
    name="Detailed Analysis",
    prompt="Analyze this response...",
    output_type=OutputType.JSON,
    output_schema={
        "type": "object",
        "properties": {
            "helpfulness": {"type": "number"},
            "accuracy": {"type": "number"},
            "issues": {"type": "array", "items": {"type": "string"}}
        },
        "required": ["helpfulness", "accuracy"]
    }
)
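
It can help to sanity-check the schema against a sample of the output you expect before running the metric. This optional sketch uses the third-party jsonschema package, which is not part of the TurnWise SDK:
import jsonschema

# The same schema passed to the metric above, kept in a local variable
# so it can be checked independently of the SDK.
output_schema = {
    "type": "object",
    "properties": {
        "helpfulness": {"type": "number"},
        "accuracy": {"type": "number"},
        "issues": {"type": "array", "items": {"type": "string"}}
    },
    "required": ["helpfulness", "accuracy"]
}

# A hand-written example of the judge output we expect; raises
# jsonschema.ValidationError if it does not match the schema.
jsonschema.validate(
    instance={"helpfulness": 0.8, "accuracy": 0.9, "issues": []},
    schema=output_schema,
)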

Template Variables

Use @VARIABLE syntax to inject conversation context:
metric = Metric(
    name="Context-Aware Helpfulness",
    prompt="""Evaluate this response.

USER GOAL: @GOAL
PREVIOUS MESSAGE: @PREVIOUS_USER_MSG
RESPONSE: @CURRENT_MESSAGE.output
CONVERSATION HISTORY: @HISTORY

Score 0-1.""",
    evaluation_level=EvaluationLevel.MESSAGE,
    output_type=OutputType.PROGRESS,
)
See Template Variables for the complete list.

Next Steps