Interface IEvaluator

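Interface for the agent evaluator: runs evaluation suites against an agent, scores outputs, registers custom scorers, and retrieves, compares, and reports on evaluation runs.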

Example

const evaluator = new Evaluator();

// Create test suite
const testCases: EvalTestCase[] = [
{
id: 'greet-1',
name: 'Basic greeting',
input: 'Hello!',
expectedOutput: 'Hello! How can I help you today?',
criteria: [
{ name: 'relevance', description: 'Is greeting appropriate', weight: 0.5, scorer: 'llm_judge' },
{ name: 'politeness', description: 'Is response polite', weight: 0.5, scorer: 'contains' },
],
},
];

// Run evaluation (agentFn is your agent: (input, context?) => Promise<string>)
const run = await evaluator.runEvaluation('greeting-test', testCases, agentFn);
console.log(`Pass rate: ${run.aggregateMetrics.passRate * 100}%`);
interface IEvaluator {
    runEvaluation(name, testCases, agentFn, config?): Promise<EvalRun>;
    evaluateTestCase(testCase, actualOutput, config?): Promise<EvalTestResult>;
    score(scorer, actual, expected?, references?): Promise<number>;
    registerScorer(name, fn): void;
    getRun(runId): Promise<undefined | EvalRun>;
    listRuns(limit?): Promise<EvalRun[]>;
    compareRuns(runId1, runId2): Promise<EvalComparison>;
    generateReport(runId, format): Promise<string>;
}

Implemented by

Methods

  • runEvaluation(name, testCases, agentFn, config?): Runs an evaluation suite against an agent.

    Parameters

    • name: string

      Name for this evaluation run

    • testCases: EvalTestCase[]

      Test cases to evaluate

    • agentFn: ((input, context?) => Promise<string>)

      Function that takes input and returns agent output

        • (input, context?): Promise<string>
        • Parameters

          • input: string
          • Optional context: string

          Returns Promise<string>

    • Optional config: EvalConfig

      Evaluation configuration

    Returns Promise<EvalRun>

    The completed evaluation run

  • score(scorer, actual, expected?, references?): Scores output using a specific scorer.

    Parameters

    • scorer: string

      Scorer name

    • actual: string

      Actual output

    • Optional expected: string

      Expected output

    • Optional references: string[]

      Reference outputs

    Returns Promise<number>

    Score (0-1)

  • generateReport(runId, format): Generates a report for a run.

    Parameters

    • runId: string

      Run ID

    • format: "json" | "markdown" | "html"

      Report format

    Returns Promise<string>

    Report content