diff --git a/api/oss/src/resources/evaluators/evaluators.py b/api/oss/src/resources/evaluators/evaluators.py
index 53a2d48542..cbca48d4fc 100644
--- a/api/oss/src/resources/evaluators/evaluators.py
+++ b/api/oss/src/resources/evaluators/evaluators.py
@@ -229,12 +229,12 @@
"description": "Extract information from the user's response.",
"type": "object",
"properties": {
- "correctness": {
+ "score": {
"type": "boolean",
"description": "The grade results",
}
},
- "required": ["correctness"],
+ "required": ["score"],
"strict": True,
},
},
@@ -264,12 +264,12 @@
"description": "Extract information from the user's response.",
"type": "object",
"properties": {
- "correctness": {
+ "score": {
"type": "boolean",
"description": "The hallucination detection result",
}
},
- "required": ["correctness"],
+ "required": ["score"],
"strict": True,
},
},
@@ -339,12 +339,12 @@
"description": "Extract information from the user's response.",
"type": "object",
"properties": {
- "correctness": {
+ "score": {
"type": "boolean",
"description": "The grade results",
}
},
- "required": ["correctness"],
+ "required": ["score"],
"strict": True,
},
},
diff --git a/api/oss/src/services/converters.py b/api/oss/src/services/converters.py
index 8ead9b7df4..ad9cb64169 100644
--- a/api/oss/src/services/converters.py
+++ b/api/oss/src/services/converters.py
@@ -13,7 +13,6 @@
HumanEvaluationScenario,
EvaluationScenarioOutput,
)
-from oss.src.services import db_manager
from oss.src.models.db_models import (
EvaluationDB,
HumanEvaluationDB,
diff --git a/api/pyproject.toml b/api/pyproject.toml
index 0b922bbfb8..e8c38e1e19 100644
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "api"
-version = "0.62.0"
+version = "0.62.1"
description = "Agenta API"
authors = [
{ name = "Mahmoud Mabrouk", email = "mahmoud@agenta.ai" },
diff --git a/docs/blog/entries/customize-llm-as-a-judge-output-schemas.mdx b/docs/blog/entries/customize-llm-as-a-judge-output-schemas.mdx
new file mode 100644
index 0000000000..033e29371b
--- /dev/null
+++ b/docs/blog/entries/customize-llm-as-a-judge-output-schemas.mdx
@@ -0,0 +1,71 @@
+---
+title: "Customize LLM-as-a-Judge Output Schemas"
+slug: customize-llm-as-a-judge-output-schemas
+date: 2025-11-10
+tags: [v0.62.0]
+description: "Learn how to customize LLM-as-a-Judge evaluator output schemas with binary, multiclass, or custom JSON formats. Enable reasoning for better evaluation quality and structure feedback to match your workflow needs."
+---
+
+import Image from "@theme/IdealImage";
+
+The LLM-as-a-Judge evaluator now supports custom output schemas. You can define exactly what feedback structure you need for your evaluations.
+
+
+
+
+
+
+
+## What's New
+
+### **Flexible Output Types**
+Configure the evaluator to return different types of outputs:
+- **Binary**: Return a simple yes/no or pass/fail score
+- **Multiclass**: Choose from multiple predefined categories
+- **Custom JSON**: Define any structure that fits your use case
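+
+As a concrete illustration, the binary type constrains the judge to a schema like the one below (a minimal sketch mirroring the defaults used by the built-in evaluators):
+
+```python
+# Sketch of the schema generated for the binary output type.
+# Field names and defaults follow the built-in evaluator configuration.
+binary_schema = {
+    "description": "Extract information from the user's response.",
+    "type": "object",
+    "properties": {
+        "score": {
+            "type": "boolean",
+            "description": "The grade results",
+        }
+    },
+    "required": ["score"],
+    "strict": True,
+}
+```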
+
+### **Include Reasoning for Better Quality**
+Enable the reasoning option to have the LLM explain its evaluation. This improves prediction quality because the model thinks through its assessment before providing a score.
+
+When you include reasoning, the evaluator returns both the score and a detailed explanation of how it arrived at that judgment.
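+
+With reasoning enabled, the generated schema gains an extra property next to the score (a `comment` field in the current configuration); a minimal sketch:
+
+```python
+# Sketch of the binary schema with reasoning enabled.
+binary_schema_with_reasoning = {
+    "type": "object",
+    "properties": {
+        "score": {
+            "type": "boolean",
+            "description": "The grade results",
+        },
+        # Reasoning property; the description text here is illustrative.
+        "comment": {
+            "type": "string",
+            "description": "The reasoning behind the score",
+        },
+    },
+    "required": ["score"],
+    "strict": True,
+}
+```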
+
+### **Advanced: Raw JSON Schema**
+For complete control, provide a raw JSON schema. The evaluator will return responses that match your exact structure.
+
+This lets you capture multiple scores, categorical labels, confidence levels, and custom fields in a single evaluation pass. You can structure the output however your workflow requires.
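+
+For instance, a raw schema might combine a numeric score with a confidence label (the field names below are purely illustrative):
+
+```python
+# Illustrative raw schema; the "confidence" field is hypothetical.
+custom_schema = {
+    "type": "object",
+    "properties": {
+        "score": {
+            "type": "number",
+            "description": "Overall quality from 0 to 10",
+            "minimum": 0,
+            "maximum": 10,
+        },
+        "confidence": {
+            "type": "string",
+            "description": "How confident the judge is in its score",
+            "enum": ["low", "medium", "high"],
+        },
+    },
+    "required": ["score", "confidence"],
+    "strict": True,
+}
+```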
+
+### **Use Custom Schemas in Evaluation**
+Once configured, your custom schemas work seamlessly in the evaluation workflow. The results display in the evaluation dashboard with all your custom fields visible.
+
+This makes it easy to analyze multiple dimensions of quality in a single evaluation run.
+
+## Example Use Cases
+
+**Binary Score with Reasoning:**
+Return a simple correct/incorrect judgment along with an explanation of why the output succeeded or failed.
+
+**Multi-dimensional Feedback:**
+Capture separate scores for accuracy, relevance, completeness, and tone in one evaluation. Include reasoning for each dimension.
+
+**Structured Classification:**
+Return categorical labels (excellent/good/fair/poor) along with specific issues found and suggestions for improvement.
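+
+A hypothetical schema for the structured-classification case could look like this (field names other than `score` are illustrative):
+
+```python
+# Hypothetical schema: a quality label plus issues and a suggestion.
+classification_schema = {
+    "type": "object",
+    "properties": {
+        "score": {
+            "type": "string",
+            "description": "Overall quality label",
+            "enum": ["excellent", "good", "fair", "poor"],
+        },
+        "issues": {
+            "type": "array",
+            "items": {"type": "string"},
+            "description": "Specific issues found in the output",
+        },
+        "suggestion": {
+            "type": "string",
+            "description": "A concrete suggestion for improvement",
+        },
+    },
+    "required": ["score"],
+    "strict": True,
+}
+```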
+
+## Getting Started
+
+To use custom output schemas with LLM-as-a-Judge:
+
+1. Open the evaluator configuration
+2. Select your desired output type (binary, multiclass, or custom)
+3. Enable reasoning if you want explanations
+4. For advanced use, provide your JSON schema
+5. Run your evaluation
+
+Learn more in the [LLM-as-a-Judge documentation](/evaluation/configure-evaluators/llm-as-a-judge).
diff --git a/docs/blog/main.mdx b/docs/blog/main.mdx
index e55eed8a9c..66a0256cb0 100644
--- a/docs/blog/main.mdx
+++ b/docs/blog/main.mdx
@@ -10,6 +10,33 @@ import Image from "@theme/IdealImage";
+### [Customize LLM-as-a-Judge Output Schemas](/changelog/customize-llm-as-a-judge-output-schemas)
+
+_10 November 2025_
+
+**v0.62.0**
+
+
+
+
+
+
+The LLM-as-a-Judge evaluator now supports custom output schemas. Create multiple feedback outputs per evaluator with any structure you need.
+
+You can configure output types (binary, multiclass), include reasoning to improve prediction quality, or provide a raw JSON schema with any structure you define. Use these custom schemas in your evaluations to capture exactly the feedback you need.
+
+Learn more in the [LLM-as-a-Judge documentation](/evaluation/configure-evaluators/llm-as-a-judge).
+
+---
+
### [Documentation Overhaul](/changelog/documentation-architecture-overhaul)
_3 November 2025_
diff --git a/docs/docs/evaluation/configure-evaluators/05-llm-as-a-judge.mdx b/docs/docs/evaluation/configure-evaluators/05-llm-as-a-judge.mdx
index 399dfde99e..a6489b156d 100644
--- a/docs/docs/evaluation/configure-evaluators/05-llm-as-a-judge.mdx
+++ b/docs/docs/evaluation/configure-evaluators/05-llm-as-a-judge.mdx
@@ -2,6 +2,8 @@
title: "LLM-as-a-Judge"
---
+import Image from "@theme/IdealImage";
+
LLM-as-a-Judge is an evaluator that uses an LLM to assess LLM outputs. It's particularly useful for evaluating text generation tasks or chatbots where there's no single correct answer.

@@ -56,4 +58,28 @@ ANSWER ONLY THE SCORE. DO NOT USE MARKDOWN. DO NOT PROVIDE ANYTHING OTHER THAN T
### The Model
-The model can be configured to select one of the supported options (`gpt-3.5-turbo`, `gpt-4o`, `gpt-5`, `gpt-5-mini`, `gpt-5-nano`, `claude-3-5-sonnet`, `claude-3-5-haiku`, `claude-3-5-opus`). To use LLM-as-a-Judge, you'll need to set your OpenAI or Anthropic API key in the settings. The key is saved locally and only sent to our servers for evaluation—it's not stored there.
+You can set the model to one of the supported options (`gpt-4o`, `gpt-5`, `gpt-5-mini`, `gpt-5-nano`, `claude-3-5-sonnet`, `claude-3-5-haiku`, `claude-3-5-opus`). To use LLM-as-a-Judge, you'll need to set your OpenAI or Anthropic API key in the settings. The key is saved locally and only sent to our servers for evaluation; it's not stored there.
+
+### Output Schema
+
+You can configure the output schema to control what the LLM evaluator returns. This allows you to get structured feedback tailored to your evaluation needs.
+
+#### Basic Configuration
+
+The basic configuration lets you choose from common output types:
+
+- **Binary**: Returns a simple pass/fail or yes/no judgment
+- **Multiclass**: Returns a classification from a predefined set of categories
+- **Continuous**: Returns a score between a minimum and maximum value
+
+You can also enable **Include Reasoning** to have the evaluator explain its judgment. This improves evaluation quality because the model reasons through its assessment before assigning a score, and it makes the decision process transparent.
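+
+For example, selecting the continuous type with reasoning enabled produces a schema along these lines (a sketch based on the default 0 to 10 bounds, which you can change):
+
+```python
+# Sketch of the generated schema for a continuous score with reasoning.
+# The 0-10 bounds are defaults; the "comment" property holds the reasoning.
+continuous_schema = {
+    "type": "object",
+    "properties": {
+        "score": {
+            "type": "number",
+            "description": "The grade results",
+            "minimum": 0,
+            "maximum": 10,
+        },
+        "comment": {
+            "type": "string",
+            "description": "The reasoning behind the score",
+        },
+    },
+    "required": ["score"],
+    "strict": True,
+}
+```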
+
+
+
+
+#### Advanced Configuration
+
+For complete control, you can provide a custom JSON schema. This lets you define any output structure you need. For example, you could return multiple scores, confidence levels, detailed feedback categories, or any combination of fields.
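+
+For example, a schema like the following could capture several dimensions plus an explanation in one pass (all field names are illustrative; in the UI you provide the equivalent JSON):
+
+```python
+# Illustrative multi-dimensional schema; every field name is hypothetical.
+multi_dimension_schema = {
+    "type": "object",
+    "properties": {
+        "accuracy": {"type": "number", "minimum": 0, "maximum": 10},
+        "relevance": {"type": "number", "minimum": 0, "maximum": 10},
+        "tone": {
+            "type": "string",
+            "enum": ["appropriate", "too_informal", "too_formal"],
+        },
+        "comment": {
+            "type": "string",
+            "description": "Overall reasoning across all dimensions",
+        },
+    },
+    "required": ["accuracy", "relevance", "tone"],
+    "strict": True,
+}
+```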
+
+
+
diff --git a/docs/static/images/changelog/changelog-llm-as-a-judge-response-1.png b/docs/static/images/changelog/changelog-llm-as-a-judge-response-1.png
new file mode 100644
index 0000000000..452c1b718a
Binary files /dev/null and b/docs/static/images/changelog/changelog-llm-as-a-judge-response-1.png differ
diff --git a/docs/static/images/changelog/changelog-llm-as-a-judge-response-2.png b/docs/static/images/changelog/changelog-llm-as-a-judge-response-2.png
new file mode 100644
index 0000000000..2a7fa18e42
Binary files /dev/null and b/docs/static/images/changelog/changelog-llm-as-a-judge-response-2.png differ
diff --git a/sdk/agenta/sdk/workflows/handlers.py b/sdk/agenta/sdk/workflows/handlers.py
index 738392f345..7216761897 100644
--- a/sdk/agenta/sdk/workflows/handlers.py
+++ b/sdk/agenta/sdk/workflows/handlers.py
@@ -511,20 +511,24 @@ def field_match_test_v0(
correct_answer = inputs[correct_answer_key]
if not isinstance(outputs, str) and not isinstance(outputs, dict):
- raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)
+ # raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)
+ return {"success": False}
outputs_dict = outputs
if isinstance(outputs, str):
try:
outputs_dict = loads(outputs)
except json.JSONDecodeError as e:
- raise InvalidOutputsV0Error(expected="dict", got=outputs) from e
+ # raise InvalidOutputsV0Error(expected="dict", got=outputs) from e
+ return {"success": False}
if not isinstance(outputs_dict, dict):
- raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)
+ # raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)
+ return {"success": False}
if not json_field in outputs_dict:
- raise MissingOutputV0Error(path=json_field)
+ # raise MissingOutputV0Error(path=json_field)
+ return {"success": False}
# --------------------------------------------------------------------------
success = outputs_dict[json_field] == correct_answer
diff --git a/sdk/pyproject.toml b/sdk/pyproject.toml
index 47db074d71..00e85df3dc 100644
--- a/sdk/pyproject.toml
+++ b/sdk/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "agenta"
-version = "0.62.0"
+version = "0.62.1"
description = "The SDK for agenta is an open-source LLMOps platform."
readme = "README.md"
authors = [
diff --git a/web/ee/package.json b/web/ee/package.json
index 3a7d0209d2..555d036cf2 100644
--- a/web/ee/package.json
+++ b/web/ee/package.json
@@ -1,6 +1,6 @@
{
"name": "@agenta/ee",
- "version": "0.62.0",
+ "version": "0.62.1",
"private": true,
"engines": {
"node": ">=18"
diff --git a/web/oss/package.json b/web/oss/package.json
index b586042152..e5e89e0037 100644
--- a/web/oss/package.json
+++ b/web/oss/package.json
@@ -1,6 +1,6 @@
{
"name": "@agenta/oss",
- "version": "0.62.0",
+ "version": "0.62.1",
"private": true,
"engines": {
"node": ">=18"
diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/JSONSchema/JSONSchemaGenerator.ts b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/JSONSchema/JSONSchemaGenerator.ts
index b6acddb008..a56de11836 100644
--- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/JSONSchema/JSONSchemaGenerator.ts
+++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/JSONSchema/JSONSchemaGenerator.ts
@@ -23,15 +23,15 @@ export function generateJSONSchema(config: SchemaConfig): GeneratedJSONSchema {
const {responseFormat, includeReasoning, continuousConfig, categoricalOptions} = config
const properties: Record = {}
- const required: string[] = ["correctness"]
+ const required: string[] = ["score"]
// Base description is always "The grade results"
const baseDescription = "The grade results"
- // Add the main correctness field based on response format
+ // Add the main score field based on response format
switch (responseFormat) {
case "continuous":
- properties.correctness = {
+ properties.score = {
type: "number",
description: baseDescription,
minimum: continuousConfig?.minimum ?? 0,
@@ -40,7 +40,7 @@ export function generateJSONSchema(config: SchemaConfig): GeneratedJSONSchema {
break
case "boolean":
- properties.correctness = {
+ properties.score = {
type: "boolean",
description: baseDescription,
}
@@ -53,14 +53,14 @@ export function generateJSONSchema(config: SchemaConfig): GeneratedJSONSchema {
.map((opt) => `"${opt.name}": ${opt.description}`)
.join("| ")
- properties.correctness = {
+ properties.score = {
type: "string",
description: `${baseDescription}. Categories: ${categoryDescriptions}`,
enum: enumValues,
}
} else {
// Fallback if no categories defined
- properties.correctness = {
+ properties.score = {
type: "string",
description: baseDescription,
}
@@ -97,43 +97,43 @@ export function parseJSONSchema(schemaString: string): SchemaConfig | null {
// Handle both old format (direct schema) and new format (with name wrapper)
const schema = parsed.schema || parsed
- if (!schema.properties || !schema.properties.correctness) {
+ if (!schema.properties || !schema.properties.score) {
return null
}
- const correctness = schema.properties.correctness
+ const score = schema.properties.score
const hasReasoning = !!schema.properties.comment
let responseFormat: SchemaConfig["responseFormat"] = "boolean"
let continuousConfig: SchemaConfig["continuousConfig"]
let categoricalOptions: SchemaConfig["categoricalOptions"]
- if (correctness.type === "number") {
+ if (score.type === "number") {
responseFormat = "continuous"
continuousConfig = {
- minimum: correctness.minimum ?? 0,
- maximum: correctness.maximum ?? 10,
+ minimum: score.minimum ?? 0,
+ maximum: score.maximum ?? 10,
}
- } else if (correctness.type === "boolean") {
+ } else if (score.type === "boolean") {
responseFormat = "boolean"
- } else if (correctness.type === "string" && correctness.enum) {
+ } else if (score.type === "string" && score.enum) {
responseFormat = "categorical"
// Parse category descriptions from the description field
- const desc = correctness.description || ""
+ const desc = score.description || ""
const categoriesMatch = desc.match(/Categories: (.+)/)
if (categoriesMatch) {
const categoriesStr = categoriesMatch[1]
const categoryPairs = categoriesStr.split("| ")
- categoricalOptions = correctness.enum.map((name: string) => {
+ categoricalOptions = score.enum.map((name: string) => {
const pair = categoryPairs.find((p: string) => p.startsWith(`"${name}":`))
const description = pair ? pair.split(": ")[1] || "" : ""
return {name, description}
})
} else {
- categoricalOptions = correctness.enum.map((name: string) => ({
+ categoricalOptions = score.enum.map((name: string) => ({
name,
description: "",
}))
diff --git a/web/package.json b/web/package.json
index f06cb54ce3..c7b4ea6ec9 100644
--- a/web/package.json
+++ b/web/package.json
@@ -1,6 +1,6 @@
{
"name": "agenta-web",
- "version": "0.62.0",
+ "version": "0.62.1",
"workspaces": [
"ee",
"oss",