diff --git a/api/oss/src/resources/evaluators/evaluators.py b/api/oss/src/resources/evaluators/evaluators.py
index 53a2d48542..cbca48d4fc 100644
--- a/api/oss/src/resources/evaluators/evaluators.py
+++ b/api/oss/src/resources/evaluators/evaluators.py
@@ -229,12 +229,12 @@
                         "description": "Extract information from the user's response.",
                         "type": "object",
                         "properties": {
-                            "correctness": {
+                            "score": {
                                 "type": "boolean",
                                 "description": "The grade results",
                             }
                         },
-                        "required": ["correctness"],
+                        "required": ["score"],
                         "strict": True,
                     },
                 },
@@ -264,12 +264,12 @@
                         "description": "Extract information from the user's response.",
                         "type": "object",
                         "properties": {
-                            "correctness": {
+                            "score": {
                                 "type": "boolean",
                                 "description": "The hallucination detection result",
                             }
                         },
-                        "required": ["correctness"],
+                        "required": ["score"],
                         "strict": True,
                     },
                 },
@@ -339,12 +339,12 @@
                         "description": "Extract information from the user's response.",
                         "type": "object",
                         "properties": {
-                            "correctness": {
+                            "score": {
                                 "type": "boolean",
                                 "description": "The grade results",
                             }
                         },
-                        "required": ["correctness"],
+                        "required": ["score"],
                         "strict": True,
                     },
                 },
diff --git a/api/oss/src/services/converters.py b/api/oss/src/services/converters.py
index 8ead9b7df4..ad9cb64169 100644
--- a/api/oss/src/services/converters.py
+++ b/api/oss/src/services/converters.py
@@ -13,7 +13,6 @@
     HumanEvaluationScenario,
    EvaluationScenarioOutput,
 )
-from oss.src.services import db_manager
 from oss.src.models.db_models import (
     EvaluationDB,
     HumanEvaluationDB,
diff --git a/api/pyproject.toml b/api/pyproject.toml
index 0b922bbfb8..e8c38e1e19 100644
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "api"
-version = "0.62.0"
+version = "0.62.1"
 description = "Agenta API"
 authors = [
     { name = "Mahmoud Mabrouk", email = "mahmoud@agenta.ai" },
diff --git a/docs/blog/entries/customize-llm-as-a-judge-output-schemas.mdx b/docs/blog/entries/customize-llm-as-a-judge-output-schemas.mdx
new file mode 100644
index 0000000000..033e29371b
--- /dev/null
+++ b/docs/blog/entries/customize-llm-as-a-judge-output-schemas.mdx
@@ -0,0 +1,71 @@
+---
+title: "Customize LLM-as-a-Judge Output Schemas"
+slug: customize-llm-as-a-judge-output-schemas
+date: 2025-11-10
+tags: [v0.62.0]
+description: "Learn how to customize LLM-as-a-Judge evaluator output schemas with binary, multiclass, or custom JSON formats. Enable reasoning for better evaluation quality and structure feedback to match your workflow needs."
+---
+
+import Image from "@theme/IdealImage";
+
+The LLM-as-a-Judge evaluator now supports custom output schemas. You can define exactly what feedback structure you need for your evaluations.
+
+<div>
+  <Image
+    img={require("/images/changelog/changelog-llm-as-a-judge-response-1.png")}
+    alt="Custom output schemas in LLM-as-a-Judge - Example 1"
+    loading="lazy"
+  />
+  <Image
+    img={require("/images/changelog/changelog-llm-as-a-judge-response-2.png")}
+    alt="Custom output schemas in LLM-as-a-Judge - Example 2"
+    loading="lazy"
+  />
+</div>
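For reference, the default binary shape being renamed in the `evaluators.py` hunks above is the schema this post is about: the graded field is now `score` instead of `correctness`. Rendered as JSON (the Python source writes `True` for `strict`), the default binary schema is:

```json
{
  "description": "Extract information from the user's response.",
  "type": "object",
  "properties": {
    "score": {
      "type": "boolean",
      "description": "The grade results"
    }
  },
  "required": ["score"],
  "strict": true
}
```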
+
+## What's New
+
+### **Flexible Output Types**
+Configure the evaluator to return different types of outputs:
+- **Binary**: Return a simple yes/no or pass/fail score
+- **Multiclass**: Choose from multiple predefined categories
+- **Custom JSON**: Define any structure that fits your use case
+
+### **Include Reasoning for Better Quality**
+Enable the reasoning option to have the LLM explain its evaluation. This improves prediction quality because the model thinks through its assessment before providing a score.
+
+When you include reasoning, the evaluator returns both the score and a detailed explanation of how it arrived at that judgment.
+
+### **Advanced: Raw JSON Schema**
+For complete control, provide a raw JSON schema. The evaluator will return responses that match your exact structure.
+
+This lets you capture multiple scores, categorical labels, confidence levels, and custom fields in a single evaluation pass. You can structure the output however your workflow requires.
+
+### **Use Custom Schemas in Evaluation**
+Once configured, your custom schemas work seamlessly in the evaluation workflow. The results display in the evaluation dashboard with all your custom fields visible.
+
+This makes it easy to analyze multiple dimensions of quality in a single evaluation run.
+
+## Example Use Cases
+
+**Binary Score with Reasoning:**
+Return a simple correct/incorrect judgment along with an explanation of why the output succeeded or failed.
+
+**Multi-dimensional Feedback:**
+Capture separate scores for accuracy, relevance, completeness, and tone in one evaluation. Include reasoning for each dimension.
+
+**Structured Classification:**
+Return categorical labels (excellent/good/fair/poor) along with specific issues found and suggestions for improvement.
+
+## Getting Started
+
+To use custom output schemas with LLM-as-a-Judge:
+
+1. Open the evaluator configuration
+2. Select your desired output type (binary, multiclass, or custom)
+3. Enable reasoning if you want explanations
+4. For advanced use, provide your JSON schema
+5. Run your evaluation
+
+Learn more in the [LLM-as-a-Judge documentation](/evaluation/configure-evaluators/llm-as-a-judge).
diff --git a/docs/blog/main.mdx b/docs/blog/main.mdx
index e55eed8a9c..66a0256cb0 100644
--- a/docs/blog/main.mdx
+++ b/docs/blog/main.mdx
@@ -10,6 +10,33 @@ import Image from "@theme/IdealImage";
 
 <div>
+### [Customize LLM-as-a-Judge Output Schemas](/changelog/customize-llm-as-a-judge-output-schemas)
+
+_10 November 2025_
+
+**v0.62.0**
+
+<div>
+  <Image
+    img={require("/images/changelog/changelog-llm-as-a-judge-response-1.png")}
+    alt="Custom output schemas in LLM-as-a-Judge - Example 1"
+    loading="lazy"
+  />
+  <Image
+    img={require("/images/changelog/changelog-llm-as-a-judge-response-2.png")}
+    alt="Custom output schemas in LLM-as-a-Judge - Example 2"
+    loading="lazy"
+  />
+</div>
+
+The LLM-as-a-Judge evaluator now supports custom output schemas. Create multiple feedback outputs per evaluator with any structure you need.
+
+You can configure output types (binary, multiclass), include reasoning to improve prediction quality, or provide a raw JSON schema with any structure you define. Use these custom schemas in your evaluations to capture exactly the feedback you need.
+
+Learn more in the [LLM-as-a-Judge documentation](/evaluation/configure-evaluators/llm-as-a-judge).
+
+---
+
 
 ### [Documentation Overhaul](/changelog/documentation-architecture-overhaul)
 
 _3 November 2025_
diff --git a/docs/docs/evaluation/configure-evaluators/05-llm-as-a-judge.mdx b/docs/docs/evaluation/configure-evaluators/05-llm-as-a-judge.mdx
index 399dfde99e..a6489b156d 100644
--- a/docs/docs/evaluation/configure-evaluators/05-llm-as-a-judge.mdx
+++ b/docs/docs/evaluation/configure-evaluators/05-llm-as-a-judge.mdx
@@ -2,6 +2,8 @@
 title: "LLM-as-a-Judge"
 ---
 
+import Image from "@theme/IdealImage";
+
 LLM-as-a-Judge is an evaluator that uses an LLM to assess LLM outputs. It's particularly useful for evaluating text generation tasks or chatbots where there's no single correct answer.
 
 ![Configuration of LLM-as-a-judge](/images/evaluation/configure-evaluators-3.png)
@@ -56,4 +58,28 @@ ANSWER ONLY THE SCORE. DO NOT USE MARKDOWN. DO NOT PROVIDE ANYTHING OTHER THAN T
 
 ### The Model
 
-The model can be configured to select one of the supported options (`gpt-3.5-turbo`, `gpt-4o`, `gpt-5`, `gpt-5-mini`, `gpt-5-nano`, `claude-3-5-sonnet`, `claude-3-5-haiku`, `claude-3-5-opus`). To use LLM-as-a-Judge, you'll need to set your OpenAI or Anthropic API key in the settings. The key is saved locally and only sent to our servers for evaluation—it's not stored there.
+The model can be configured to select one of the supported options (`gpt-4o`, `gpt-5`, `gpt-5-mini`, `gpt-5-nano`, `claude-3-5-sonnet`, `claude-3-5-haiku`, `claude-3-5-opus`). To use LLM-as-a-Judge, you'll need to set your OpenAI or Anthropic API key in the settings. The key is saved locally and only sent to our servers for evaluation; it's not stored there.
+
+### Output Schema
+
+You can configure the output schema to control what the LLM evaluator returns. This allows you to get structured feedback tailored to your evaluation needs.
+
+#### Basic Configuration
+
+The basic configuration lets you choose from common output types:
+
+- **Binary**: Returns a simple pass/fail or yes/no judgment
+- **Multiclass**: Returns a classification from a predefined set of categories
+- **Continuous**: Returns a score between a minimum and maximum value
+
+You can also enable **Include Reasoning** to have the evaluator explain its judgment. This option significantly improves the quality of evaluations by making the LLM's decision process transparent.
+
+{/* Image: Basic output schema configuration */}
+
+#### Advanced Configuration
+
+For complete control, you can provide a custom JSON schema. This lets you define any output structure you need. For example, you could return multiple scores, confidence levels, detailed feedback categories, or any combination of fields.
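As a sketch of what such an advanced schema could look like, assuming you want one numeric score, one quality label, and free-form reasoning; every field name here is illustrative rather than a contract required by the evaluator:

```json
{
  "type": "object",
  "properties": {
    "accuracy": {
      "type": "number",
      "minimum": 0,
      "maximum": 10,
      "description": "Illustrative field: factual accuracy of the answer"
    },
    "label": {
      "type": "string",
      "enum": ["excellent", "good", "fair", "poor"],
      "description": "Illustrative field: overall quality label"
    },
    "comment": {
      "type": "string",
      "description": "Illustrative field: reasoning behind the judgment"
    }
  },
  "required": ["accuracy", "label"]
}
```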
+
+{/* Image: Advanced output schema configuration */}
diff --git a/docs/static/images/changelog/changelog-llm-as-a-judge-response-1.png b/docs/static/images/changelog/changelog-llm-as-a-judge-response-1.png
new file mode 100644
index 0000000000..452c1b718a
Binary files /dev/null and b/docs/static/images/changelog/changelog-llm-as-a-judge-response-1.png differ
diff --git a/docs/static/images/changelog/changelog-llm-as-a-judge-response-2.png b/docs/static/images/changelog/changelog-llm-as-a-judge-response-2.png
new file mode 100644
index 0000000000..2a7fa18e42
Binary files /dev/null and b/docs/static/images/changelog/changelog-llm-as-a-judge-response-2.png differ
diff --git a/sdk/agenta/sdk/workflows/handlers.py b/sdk/agenta/sdk/workflows/handlers.py
index 738392f345..7216761897 100644
--- a/sdk/agenta/sdk/workflows/handlers.py
+++ b/sdk/agenta/sdk/workflows/handlers.py
@@ -511,20 +511,24 @@ def field_match_test_v0(
     correct_answer = inputs[correct_answer_key]
 
     if not isinstance(outputs, str) and not isinstance(outputs, dict):
-        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)
+        # raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)
+        return {"success": False}
 
     outputs_dict = outputs
 
     if isinstance(outputs, str):
         try:
             outputs_dict = loads(outputs)
         except json.JSONDecodeError as e:
-            raise InvalidOutputsV0Error(expected="dict", got=outputs) from e
+            # raise InvalidOutputsV0Error(expected="dict", got=outputs) from e
+            return {"success": False}
 
     if not isinstance(outputs_dict, dict):
-        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)
+        # raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)
+        return {"success": False}
 
     if not json_field in outputs_dict:
-        raise MissingOutputV0Error(path=json_field)
+        # raise MissingOutputV0Error(path=json_field)
+        return {"success": False}
     # --------------------------------------------------------------------------
     success = outputs_dict[json_field] == correct_answer
diff --git a/sdk/pyproject.toml b/sdk/pyproject.toml
index 47db074d71..00e85df3dc 100644
--- a/sdk/pyproject.toml
+++ b/sdk/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "agenta"
-version = "0.62.0"
+version = "0.62.1"
 description = "The SDK for agenta is an open-source LLMOps platform."
readme = "README.md" authors = [ diff --git a/web/ee/package.json b/web/ee/package.json index 3a7d0209d2..555d036cf2 100644 --- a/web/ee/package.json +++ b/web/ee/package.json @@ -1,6 +1,6 @@ { "name": "@agenta/ee", - "version": "0.62.0", + "version": "0.62.1", "private": true, "engines": { "node": ">=18" diff --git a/web/oss/package.json b/web/oss/package.json index b586042152..e5e89e0037 100644 --- a/web/oss/package.json +++ b/web/oss/package.json @@ -1,6 +1,6 @@ { "name": "@agenta/oss", - "version": "0.62.0", + "version": "0.62.1", "private": true, "engines": { "node": ">=18" diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/JSONSchema/JSONSchemaGenerator.ts b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/JSONSchema/JSONSchemaGenerator.ts index b6acddb008..a56de11836 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/JSONSchema/JSONSchemaGenerator.ts +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/JSONSchema/JSONSchemaGenerator.ts @@ -23,15 +23,15 @@ export function generateJSONSchema(config: SchemaConfig): GeneratedJSONSchema { const {responseFormat, includeReasoning, continuousConfig, categoricalOptions} = config const properties: Record = {} - const required: string[] = ["correctness"] + const required: string[] = ["score"] // Base description is always "The grade results" const baseDescription = "The grade results" - // Add the main correctness field based on response format + // Add the main score field based on response format switch (responseFormat) { case "continuous": - properties.correctness = { + properties.score = { type: "number", description: baseDescription, minimum: continuousConfig?.minimum ?? 0, @@ -40,7 +40,7 @@ export function generateJSONSchema(config: SchemaConfig): GeneratedJSONSchema { break case "boolean": - properties.correctness = { + properties.score = { type: "boolean", description: baseDescription, } @@ -53,14 +53,14 @@ export function generateJSONSchema(config: SchemaConfig): GeneratedJSONSchema { .map((opt) => `"${opt.name}": ${opt.description}`) .join("| ") - properties.correctness = { + properties.score = { type: "string", description: `${baseDescription}. Categories: ${categoryDescriptions}`, enum: enumValues, } } else { // Fallback if no categories defined - properties.correctness = { + properties.score = { type: "string", description: baseDescription, } @@ -97,43 +97,43 @@ export function parseJSONSchema(schemaString: string): SchemaConfig | null { // Handle both old format (direct schema) and new format (with name wrapper) const schema = parsed.schema || parsed - if (!schema.properties || !schema.properties.correctness) { + if (!schema.properties || !schema.properties.score) { return null } - const correctness = schema.properties.correctness + const score = schema.properties.score const hasReasoning = !!schema.properties.comment let responseFormat: SchemaConfig["responseFormat"] = "boolean" let continuousConfig: SchemaConfig["continuousConfig"] let categoricalOptions: SchemaConfig["categoricalOptions"] - if (correctness.type === "number") { + if (score.type === "number") { responseFormat = "continuous" continuousConfig = { - minimum: correctness.minimum ?? 0, - maximum: correctness.maximum ?? 10, + minimum: score.minimum ?? 0, + maximum: score.maximum ?? 
10, } - } else if (correctness.type === "boolean") { + } else if (score.type === "boolean") { responseFormat = "boolean" - } else if (correctness.type === "string" && correctness.enum) { + } else if (score.type === "string" && score.enum) { responseFormat = "categorical" // Parse category descriptions from the description field - const desc = correctness.description || "" + const desc = score.description || "" const categoriesMatch = desc.match(/Categories: (.+)/) if (categoriesMatch) { const categoriesStr = categoriesMatch[1] const categoryPairs = categoriesStr.split("| ") - categoricalOptions = correctness.enum.map((name: string) => { + categoricalOptions = score.enum.map((name: string) => { const pair = categoryPairs.find((p: string) => p.startsWith(`"${name}":`)) const description = pair ? pair.split(": ")[1] || "" : "" return {name, description} }) } else { - categoricalOptions = correctness.enum.map((name: string) => ({ + categoricalOptions = score.enum.map((name: string) => ({ name, description: "", })) diff --git a/web/package.json b/web/package.json index f06cb54ce3..c7b4ea6ec9 100644 --- a/web/package.json +++ b/web/package.json @@ -1,6 +1,6 @@ { "name": "agenta-web", - "version": "0.62.0", + "version": "0.62.1", "workspaces": [ "ee", "oss",
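To make the `correctness` to `score` rename in `JSONSchemaGenerator.ts` concrete: for a hypothetical config with `responseFormat: "categorical"` and two invented options (`good`: "meets the bar", `bad`: "misses it"), the categorical branch above would now generate a `score` property along these lines (the top-level name wrapper and the reasoning `comment` field are handled outside the hunks shown, so they are omitted here):

```json
{
  "type": "object",
  "properties": {
    "score": {
      "type": "string",
      "description": "The grade results. Categories: \"good\": meets the bar| \"bad\": misses it",
      "enum": ["good", "bad"]
    }
  },
  "required": ["score"]
}
```

Note the round trip: `parseJSONSchema` recovers the option descriptions by matching `Categories: (.+)` in that description string and splitting on `"| "`, which is why the generator embeds them there.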