Evals · JSON Structure

Evals Eval Run Structure

A single graded execution of a system-under-test against an eval case, captured in JSON Structure form for cross-tool interchange of eval results.

Type: object · Properties: 17 · Required: 7

Evals · LLM Evaluation · AI Quality · Benchmarks · LLM as a Judge · Observability · Agent Evaluation · RAG Evaluation · Test-Driven AI

EvalRun is a JSON Structure definition published by Evals, describing 17 properties, of which 7 are required. It conforms to the https://json-structure.org/meta/core/v0/# meta-schema.

Properties

id, suite_id, case_id, experiment_id, model, prompt, input, output, output_structured, expected, scorer, score, label, evidence, metrics, tags, timestamp

Meta-schema: https://json-structure.org/meta/core/v0/#

JSON Structure

{
  "$schema": "https://json-structure.org/meta/core/v0/#",
  "$id": "https://raw.githubusercontent.com/api-evangelist/evals/refs/heads/main/json-structure/evals-eval-run-structure.json",
  "name": "EvalRun",
  "description": "A single graded execution of a system-under-test against an eval case, captured in JSON Structure form for cross-tool interchange of eval results.",
  "type": "object",
  "properties": {
    "id": {
      "type": "string",
      "description": "Unique identifier for this eval run record."
    },
    "suite_id": {
      "type": "string",
      "description": "Identifier of the parent eval suite."
    },
    "case_id": {
      "type": "string",
      "description": "Identifier of the eval case executed."
    },
    "experiment_id": {
      "type": "string",
      "description": "Identifier of the experiment grouping a set of runs."
    },
    "model": {
      "type": "object",
      "description": "The model and configuration being evaluated.",
      "properties": {
        "provider": {
          "type": "string"
        },
        "name": {
          "type": "string"
        },
        "version": {
          "type": "string"
        },
        "temperature": {
          "type": "number"
        },
        "max_tokens": {
          "type": "integer"
        },
        "system_prompt": {
          "type": "string"
        }
      }
    },
    "prompt": {
      "type": "string"
    },
    "input": {
      "type": "object"
    },
    "output": {
      "type": "string"
    },
    "output_structured": {
      "type": "object"
    },
    "expected": {
      "type": "string"
    },
    "scorer": {
      "type": "object",
      "properties": {
        "id": {
          "type": "string"
        },
        "name": {
          "type": "string"
        },
        "type": {
          "type": "string",
          "enum": [
            "code",
            "llm_judge",
            "human",
            "heuristic",
            "reference_based",
            "reference_free",
            "pairwise"
          ]
        }
      }
    },
    "score": {
      "type": "number"
    },
    "label": {
      "type": "string"
    },
    "evidence": {
      "type": "object",
      "properties": {
        "rationale": {
          "type": "string"
        },
        "judge_model": {
          "type": "string"
        },
        "trace_id": {
          "type": "string"
        },
        "retrieved_context": {
          "type": "array",
          "items": {
            "type": "string"
          }
        }
      }
    },
    "metrics": {
      "type": "object",
      "properties": {
        "latency_ms": {
          "type": "integer"
        },
        "input_tokens": {
          "type": "integer"
        },
        "output_tokens": {
          "type": "integer"
        },
        "cost_usd": {
          "type": "number"
        }
      }
    },
    "tags": {
      "type": "array",
      "items": {
        "type": "string"
      }
    },
    "timestamp": {
      "type": "string",
      "format": "date-time"
    }
  },
  "required": [
    "id",
    "case_id",
    "model",
    "output",
    "scorer",
    "score",
    "timestamp"
  ]
}