From e5a0e16f0ff384fd551f0d26226776516cf9d9e7 Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Tue, 27 Jan 2026 18:55:03 +0100 Subject: [PATCH 1/4] docs: add evaluator playground migration planning workspace --- .../migrate-evaluator-playground/README.md | 40 ++ .../migrate-evaluator-playground/context.md | 57 +++ .../current-system.md | 230 +++++++++ .../migration-options.md | 125 +++++ .../new-endpoints.md | 428 +++++++++++++++++ .../migrate-evaluator-playground/plan.md | 450 ++++++++++++++++++ .../migrate-evaluator-playground/research.md | 211 ++++++++ .../risk-analysis.md | 287 +++++++++++ .../migrate-evaluator-playground/status.md | 134 ++++++ 9 files changed, 1962 insertions(+) create mode 100644 docs/design/migrate-evaluator-playground/README.md create mode 100644 docs/design/migrate-evaluator-playground/context.md create mode 100644 docs/design/migrate-evaluator-playground/current-system.md create mode 100644 docs/design/migrate-evaluator-playground/migration-options.md create mode 100644 docs/design/migrate-evaluator-playground/new-endpoints.md create mode 100644 docs/design/migrate-evaluator-playground/plan.md create mode 100644 docs/design/migrate-evaluator-playground/research.md create mode 100644 docs/design/migrate-evaluator-playground/risk-analysis.md create mode 100644 docs/design/migrate-evaluator-playground/status.md diff --git a/docs/design/migrate-evaluator-playground/README.md b/docs/design/migrate-evaluator-playground/README.md new file mode 100644 index 0000000000..4197c667d1 --- /dev/null +++ b/docs/design/migrate-evaluator-playground/README.md @@ -0,0 +1,40 @@ +# Migrate Evaluator Playground to New Evaluator Endpoints + +## Overview + +This planning workspace documents the migration of the Evaluator Playground frontend to use the new workflow-based evaluator endpoints. 
The backend team has migrated evaluators from the old `EvaluatorConfig` model to the new `SimpleEvaluator` (workflow-based) model, and has created backward-compatible legacy endpoints. This migration will update the frontend to use the new endpoints directly. + +## Context + +- **PR #3527**: Backend migration that introduces new evaluator endpoints while keeping legacy endpoints for backward compatibility +- **Goal**: Migrate the Evaluator Playground frontend to use new endpoints, improving consistency with the new workflow-based architecture + +## Documents + +| File | Description | +|------|-------------| +| [context.md](./context.md) | Background, motivation, problem statement, goals, and non-goals | +| [current-system.md](./current-system.md) | Detailed map of current Evaluator Playground implementation | +| [new-endpoints.md](./new-endpoints.md) | New evaluator endpoint shapes and differences from legacy | +| [research.md](./research.md) | Deep dive into evaluator execution architecture and URI-based handlers | +| [migration-options.md](./migration-options.md) | Migration plan options: direct vs transitional approaches | +| [risk-analysis.md](./risk-analysis.md) | Coupling points and risk areas for the migration | +| [plan.md](./plan.md) | Migration execution plan with phases and milestones | +| [status.md](./status.md) | Living document for progress updates and decisions | + +## Key Files Affected + +### Frontend - Core Components +- `web/oss/src/components/Evaluators/` - Evaluators registry +- `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/` - Playground UI +- `web/oss/src/services/evaluators/index.ts` - API service layer +- `web/oss/src/services/evaluations/api_ee/index.ts` - Evaluator run execution + +### Frontend - State Management +- `web/oss/src/state/evaluators/atoms.ts` - Evaluator query atoms +- `web/oss/src/lib/atoms/evaluation.ts` - Legacy evaluation atoms + +### Backend Reference (PR #3527) +- 
`api/oss/src/routers/evaluators_router.py` - Legacy endpoints (kept for backward compatibility) +- `api/oss/src/apis/fastapi/evaluators/router.py` - New `SimpleEvaluators` router +- `api/oss/src/core/evaluators/dtos.py` - New data transfer objects diff --git a/docs/design/migrate-evaluator-playground/context.md b/docs/design/migrate-evaluator-playground/context.md new file mode 100644 index 0000000000..4f2fdae9be --- /dev/null +++ b/docs/design/migrate-evaluator-playground/context.md @@ -0,0 +1,57 @@ +# Context: Migrate Evaluator Playground + +## Background + +The Agenta platform has undergone a significant architectural change where **evaluators are now workflows**. This means evaluators follow the same git-like versioning model as other workflows: +- **Artifact** (Evaluator) → **Variant** → **Revision** + +Previously, evaluators were stored in a flat `EvaluatorConfigDB` table with simple key-value settings. The new model stores evaluators as `WorkflowArtifactDBE`, `WorkflowVariantDBE`, and `WorkflowRevisionDBE` records with richer metadata and versioning. + +## Motivation + +1. **Unified Architecture**: Evaluators, testsets, and apps now share the same git-like workflow model +2. **Better Versioning**: Evaluators can have multiple variants and revision history +3. **Richer Metadata**: New model supports URIs, schemas, scripts, and configuration in a structured way +4. **Future Extensibility**: Custom evaluators will be first-class citizens with the same capabilities as built-in ones + +## Problem Statement + +The Evaluator Playground frontend currently uses legacy endpoints: +- `GET /evaluators/` - List evaluator templates +- `GET/POST/PUT/DELETE /evaluators/configs/` - CRUD for evaluator configurations +- `POST /evaluators/{key}/run/` - Run evaluator in playground + +The backend (PR #3527) has: +1. Migrated all evaluator configs to the new workflow-based model via DB migrations +2. Created new `SimpleEvaluators` endpoints at `/preview/simple/evaluators/` +3. 
Kept legacy endpoints as thin wrappers that convert new model back to legacy format + +**The frontend needs to migrate to use the new endpoints directly.** + +## Goals + +1. **Replace legacy evaluator config CRUD** with new `SimpleEvaluator` endpoints +2. **Update data models** in frontend to match new `SimpleEvaluator` shape +3. **Maintain backward compatibility** during transition (feature flag or gradual rollout) +4. **Keep the evaluator run endpoint** (`/evaluators/{key}/run/`) - this remains unchanged +5. **Preserve UX** - no user-facing changes to the Evaluator Playground functionality + +## Non-Goals + +1. **Not migrating the evaluator run endpoint** - The `/evaluators/{key}/run/` endpoint is still used and works the same way +2. **Not changing the Evaluator Playground UI** - Only the data layer changes +3. **Not migrating evaluation batch runs** - Those use evaluator revision IDs which are handled by the backend migration +4. **Not introducing new evaluator features** - This is purely a migration of the frontend data layer to the new backend endpoints + +## Success Criteria + +1. Evaluator Playground can create, edit, delete evaluators using new endpoints +2. All existing evaluator configurations continue to work +3. No regression in evaluator testing functionality +4. Clean removal of legacy endpoint usage in frontend + +## Constraints + +1. Must not break existing evaluator configurations +2. Must coordinate with backend team on endpoint availability +3. Should be deployable incrementally (not big-bang) diff --git a/docs/design/migrate-evaluator-playground/current-system.md b/docs/design/migrate-evaluator-playground/current-system.md new file mode 100644 index 0000000000..7797d76ec4 --- /dev/null +++ b/docs/design/migrate-evaluator-playground/current-system.md @@ -0,0 +1,230 @@ +# Current System: Evaluator Playground + +## Overview + +The Evaluator Playground allows users to: +1. **Browse** evaluator templates (built-in evaluators) +2. 
**Create/Configure** evaluator configurations with custom settings +3. **Test** evaluators by running them against app variants and test cases +4. **Manage** (edit, clone, delete) existing evaluator configurations + +## File Structure + +### Entry Points (Pages) + +| Path | Purpose | +|------|---------| +| `/web/oss/src/pages/w/[workspace_id]/p/[project_id]/evaluators/index.tsx` | Evaluators list page | +| `/web/oss/src/pages/w/[workspace_id]/p/[project_id]/evaluators/configure/[evaluator_id].tsx` | Configure evaluator page | + +### Core Components + +#### Evaluators Registry (`/web/oss/src/components/Evaluators/`) + +| File | Purpose | +|------|---------| +| `index.tsx` | Main registry with table, search, tabs (automatic/human) | +| `hooks/useEvaluatorsRegistryData.ts` | Fetches and transforms evaluator data | +| `assets/getColumns.tsx` | Table column definitions | +| `components/SelectEvaluatorModal/` | Modal to select evaluator template for new config | +| `components/ConfigureEvaluator/index.tsx` | Page wrapper that loads data and initializes atoms | +| `components/DeleteEvaluatorsModal/` | Delete confirmation modal | + +#### ConfigureEvaluator (Main UI) + +Location: `/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/` + +| File | Purpose | +|------|---------| +| `index.tsx` | Configuration form + test panel layout | +| `DebugSection.tsx` | Test evaluator panel (run variant, run evaluator) | +| `DynamicFormField.tsx` | Renders settings fields based on evaluator template | +| `AdvancedSettings.tsx` | Collapsible advanced parameters | +| `state/atoms.ts` | Jotai atoms for playground state | +| `variantUtils.ts` | Utility for building variants from revisions | + +### State Management + +#### Playground Atoms (`state/atoms.ts`) + +```typescript +// Session state +playgroundSessionAtom // { evaluator, existingConfigId, mode } +playgroundEvaluatorAtom // Current evaluator template (derived) +playgroundIsEditModeAtom // Is 
editing existing config? (derived) +playgroundIsCloneModeAtom // Is cloning config? (derived) +playgroundEditValuesAtom // Current config values being edited + +// Form state +playgroundFormRefAtom // Ant Design Form instance + +// Test section state +playgroundSelectedVariantAtom // Selected variant for testing +playgroundSelectedTestsetIdAtom // Selected testset ID +playgroundSelectedRevisionIdAtom // Selected revision ID +playgroundSelectedTestcaseAtom // Testcase data +playgroundTraceTreeAtom // Trace output from running variant + +// Persisted state (localStorage) +playgroundLastAppIdAtom // Last used app ID +playgroundLastVariantIdAtom // Last used variant ID + +// Action atoms +initPlaygroundAtom // Initialize playground state +resetPlaygroundAtom // Reset all state +commitPlaygroundAtom // Update state after save +cloneCurrentConfigAtom // Switch to clone mode +``` + +#### Global Evaluator Atoms (`/web/oss/src/state/evaluators/atoms.ts`) + +```typescript +evaluatorConfigsQueryAtomFamily // Query for evaluator configs +evaluatorsQueryAtomFamily // Query for evaluator templates +nonArchivedEvaluatorsAtom // Derived: non-archived evaluators +evaluatorByKeyAtomFamily // Find evaluator by key +``` + +### API Service Layer + +#### Evaluators Service (`/web/oss/src/services/evaluators/index.ts`) + +```typescript +// Evaluator Templates (legacy) +fetchAllEvaluators() // GET /evaluators + +// Evaluator Configs (legacy) +fetchAllEvaluatorConfigs() // GET /evaluators/configs +createEvaluatorConfig() // POST /evaluators/configs +updateEvaluatorConfig() // PUT /evaluators/configs/{id} +deleteEvaluatorConfig() // DELETE /evaluators/configs/{id} + +// Custom/Human Evaluators (new) +createEvaluator() // POST /preview/simple/evaluators/ +updateEvaluator() // PUT /preview/simple/evaluators/{id} +fetchEvaluatorById() // GET /preview/simple/evaluators/{id} +deleteHumanEvaluator() // POST /preview/simple/evaluators/{id}/archive +``` + +#### Evaluator Run Service 
(`/web/oss/src/services/evaluations/api_ee/index.ts`) + +```typescript +createEvaluatorDataMapping() // POST /evaluators/map +createEvaluatorRunExecution() // POST /evaluators/{key}/run +``` + +## Data Flow + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ USER ACTIONS │ +│ - Browse evaluators list │ +│ - Create new evaluator config │ +│ - Edit existing evaluator config │ +│ - Test evaluator with variant + testcase │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ ENTRY POINTS │ +│ /evaluators → EvaluatorsRegistry │ +│ ├─ Uses useEvaluatorsRegistryData() hook │ +│ │ ├─ Calls fetchAllEvaluators() → GET /evaluators │ +│ │ └─ Calls fetchAllEvaluatorConfigs() → GET /evaluators/configs │ +│ │ │ +│ ├─ "Create new" → SelectEvaluatorModal → /evaluators/configure/new │ +│ └─ Click row → /evaluators/configure/{id} │ +│ │ +│ /evaluators/configure/{id} → ConfigureEvaluatorPage │ +│ ├─ Loads evaluator template & existing config │ +│ ├─ Initializes playgroundSessionAtom │ +│ └─ Renders ConfigureEvaluator component │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ ConfigureEvaluator │ +│ ┌─────────────────────────────┐ ┌─────────────────────────────┐ │ +│ │ LEFT: Configuration Form │ │ RIGHT: DebugSection │ │ +│ │ - Name input │ │ - Testcase selector │ │ +│ │ - DynamicFormField[] │ │ - Variant selector │ │ +│ │ - AdvancedSettings │ │ - Run variant button │ │ +│ │ - Commit/Reset buttons │ │ - Run evaluator button │ │ +│ └─────────────────────────────┘ └─────────────────────────────┘ │ +│ │ +│ Commit Actions: │ +│ - Create: POST /evaluators/configs → createEvaluatorConfig() │ +│ - Update: PUT /evaluators/configs/{id} → updateEvaluatorConfig() │ +│ │ +│ Test Actions: │ +│ - Run Variant: 
callVariant() → POST to variant URL │ +│ - Run Evaluator: createEvaluatorRunExecution() │ +│ → POST /evaluators/{key}/run │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +## Current API Endpoints Used + +### Legacy Endpoints (to be migrated) + +| Endpoint | Method | Frontend Function | Purpose | +|----------|--------|-------------------|---------| +| `/evaluators/` | GET | `fetchAllEvaluators()` | List evaluator templates | +| `/evaluators/configs/` | GET | `fetchAllEvaluatorConfigs()` | List evaluator configs | +| `/evaluators/configs/` | POST | `createEvaluatorConfig()` | Create new config | +| `/evaluators/configs/{id}/` | PUT | `updateEvaluatorConfig()` | Update existing config | +| `/evaluators/configs/{id}/` | DELETE | `deleteEvaluatorConfig()` | Delete config | + +### Endpoints That Remain Unchanged + +| Endpoint | Method | Frontend Function | Purpose | +|----------|--------|-------------------|---------| +| `/evaluators/map/` | POST | `createEvaluatorDataMapping()` | Map trace data for RAG evaluators | +| `/evaluators/{key}/run/` | POST | `createEvaluatorRunExecution()` | Run evaluator (test) | + +### Already Using New Endpoints (for custom evaluators) + +| Endpoint | Method | Frontend Function | Purpose | +|----------|--------|-------------------|---------| +| `/preview/simple/evaluators/` | POST | `createEvaluator()` | Create custom evaluator | +| `/preview/simple/evaluators/{id}` | PUT | `updateEvaluator()` | Update custom evaluator | +| `/preview/simple/evaluators/{id}` | GET | `fetchEvaluatorById()` | Fetch evaluator by ID | +| `/preview/simple/evaluators/{id}/archive` | POST | `deleteHumanEvaluator()` | Archive human evaluator | + +## Data Types + +### Current EvaluatorConfig (Legacy) + +```typescript +interface EvaluatorConfig { + id: string + evaluator_key: string + name: string + settings_values: Record + created_at: string + updated_at: string + color?: string + tags?: string[] + // Frontend additions + 
icon_url?: string | StaticImageData +} +``` + +### Current Evaluator Template (Legacy) + +```typescript +interface Evaluator { + name: string + key: string + settings_presets?: SettingsPreset[] + settings_template: Record + icon_url?: string | StaticImageData + color?: string + direct_use?: boolean + description: string + oss?: boolean + requires_llm_api_keys?: boolean + tags: string[] + archived?: boolean +} +``` diff --git a/docs/design/migrate-evaluator-playground/migration-options.md b/docs/design/migrate-evaluator-playground/migration-options.md new file mode 100644 index 0000000000..6ea44db0de --- /dev/null +++ b/docs/design/migrate-evaluator-playground/migration-options.md @@ -0,0 +1,125 @@ +# Migration Options (Plan A vs Plan B) + +## Goal + +Full migration of the Evaluator Playground to the new workflow-based evaluator APIs, including: +- CRUD on evaluator configs via `/preview/simple/evaluators/*` (or the richer `/preview/evaluators/*` family) +- Running evaluators via native workflow invocation (`/preview/workflows/invoke`) instead of the legacy `/evaluators/{key}/run` + +This doc lists two concrete migration strategies. + +--- + +## Plan A (Transitional): Keep Internal Shapes Stable + +This is the earlier approach: keep the UI/state assuming the legacy `EvaluatorConfig` shape and translate at the API boundary. + +### Why it exists + +- Minimizes touching UI/atoms/forms +- Lets you swap endpoints quickly with limited regression surface +- Good when backend is still stabilizing schemas + +### Trade-offs + +- Adds an extra abstraction layer (adapters) +- Can delay paying down legacy assumptions (`settings_values`, `evaluator_key`, etc.) 
+ +--- + +## Plan B (Preferred): Direct Migration (No Adapters) + +This changes the frontend domain model to match the backend reality: +- “Evaluator config” becomes `SimpleEvaluator` (workflow artifact w/ latest evaluator revision data attached) +- Execution uses workflow invocation (`/preview/workflows/invoke`) using evaluator `data.uri` + +### Why it’s better long-term + +- Eliminates translation debt +- Aligns with “evaluators are workflows” concept end-to-end +- Unlocks revision-aware runs and custom evaluator URIs + +### Initial Scope (not exhaustive) + +#### 1) Data model and type changes + +- Introduce TS types for `SimpleEvaluator*` and `WorkflowService*` (request/response) +- Replace usages of `EvaluatorConfig` in the evaluator playground path with `SimpleEvaluator` + +Key places: +- `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts` +- `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx` +- `web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts` + +#### 2) CRUD endpoint swap (configs) + +Replace: +- `GET/POST/PUT/DELETE /evaluators/configs/*` + +With: +- `POST /preview/simple/evaluators/query` +- `POST /preview/simple/evaluators/` +- `PUT /preview/simple/evaluators/{id}` +- `POST /preview/simple/evaluators/{id}/archive` + +Key files: +- `web/oss/src/services/evaluators/index.ts` +- `web/oss/src/state/evaluators/atoms.ts` + +Notes: +- `evaluator_key` is now derived from `simpleEvaluator.data.uri` (or carried separately in UI state) +- Settings are now `simpleEvaluator.data.parameters` + +#### 3) Run endpoint swap (native invoke) + +Replace: +- `POST /evaluators/{evaluator_key}/run` + +With: +- `POST /preview/workflows/invoke` + +What needs changing in the playground: +- `DebugSection.tsx` currently uses `createEvaluatorRunExecution(evaluatorKey, {inputs, settings})` +- New call should construct `WorkflowServiceRequest`: + - 
`interface.uri` (or `configuration`+`interface`) derived from evaluator `data` / built-in key + - `data.inputs` (merged testcase + prediction) + - `data.outputs` (prediction/output) + - `data.parameters` (settings) + +Key files: +- `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DebugSection.tsx` +- plus a new service client, e.g. `web/oss/src/services/workflows/invoke.ts` + +#### 4) Registry/list UI adjustments + +The evaluators registry table expects legacy `evaluator_key` and `settings_values`. Under Plan B: +- The list source becomes `SimpleEvaluator[]` +- Table columns need to read from `data.uri` and `data.parameters` + +Key files: +- `web/oss/src/components/Evaluators/index.tsx` +- `web/oss/src/components/Evaluators/assets/getColumns.tsx` +- `web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts` + +#### 5) Permissions and error handling + +Native invoke uses `RUN_WORKFLOWS` permission (backend check). Expect: +- Different 403 behavior for some users +- Different error shape: workflow service returns `status.code/message` in response + +UI needs: +- Map workflow error status to `message.error` and output editor + +--- + +## Practical Recommendation + +If the objective is “duplicate all endpoints and fully migrate”, Plan B is the right destination. + +To reduce risk while still avoiding adapters, a pragmatic sequencing is: + +1) Migrate CRUD to SimpleEvaluator endpoints (Plan B) +2) Keep legacy run for 1-2 PRs while CRUD stabilizes +3) Migrate run to `/preview/workflows/invoke` (Plan B completion) + +This keeps changes reviewable without introducing a permanent adapter layer. 
diff --git a/docs/design/migrate-evaluator-playground/new-endpoints.md b/docs/design/migrate-evaluator-playground/new-endpoints.md new file mode 100644 index 0000000000..05231c4813 --- /dev/null +++ b/docs/design/migrate-evaluator-playground/new-endpoints.md @@ -0,0 +1,428 @@ +# New Evaluator Endpoints + +## Overview + +The new evaluator system treats evaluators as **workflows** with git-like versioning. The `SimpleEvaluator` API provides a simplified interface that abstracts the underlying workflow structure. + +## Key Architectural Change + +**Evaluators are now workflows identified by URIs.** + +URI Format: `agenta:builtin:{evaluator_key}:v0` + +Example: `agenta:builtin:auto_exact_match:v0` + +The SDK has a `HANDLER_REGISTRY` that maps URIs to actual handler functions. This enables: +- Native workflow invocation via URI +- Custom evaluators with user-defined URIs (`user:custom:my_evaluator:latest`) +- Version management of evaluator implementations + +## Evaluator Execution Paths + +### Option 1: Legacy Run Endpoint (Maintained for Backward Compatibility) + +``` +POST /evaluators/{evaluator_key}/run/ +``` + +**Request:** +```typescript +interface EvaluatorInputInterface { + inputs: Record<string, any> // prediction, ground_truth, etc. + settings: Record<string, any> // evaluator configuration + credentials?: Record<string, any> +} +``` + +**Response:** +```typescript +interface EvaluatorOutputInterface { + outputs: Record<string, any> // score, success, etc. 
+} +``` + +**Internal Implementation (PR #3527):** +```python +async def _run_evaluator(evaluator_key: str, evaluator_input): + # Build URI from evaluator_key + uri = f"agenta:builtin:{evaluator_key}:v0" + + # Retrieve handler from SDK registry + handler = retrieve_handler(uri) + + # Invoke handler directly + result = handler(inputs=inputs, outputs=outputs, parameters=settings) + + return {"outputs": result} +``` + +### Option 2: Native Workflow Invoke Endpoint + +``` +POST /preview/workflows/invoke +``` + +**Request:** +```typescript +interface WorkflowServiceRequest { + data: { + inputs: Record + outputs?: any + parameters?: Record // settings + } + revision?: { + data?: { + uri: string // e.g., "agenta:builtin:auto_exact_match:v0" + parameters?: Record + } + } +} +``` + +**Response:** +```typescript +interface WorkflowServiceBatchResponse { + data: { + outputs: Record + } + status?: { + code: number + message: string + } +} +``` + +### Option 3: Evaluator Revision-Based Invoke + +For a fully "native" approach: + +1. **Fetch the evaluator revision:** + ``` + POST /preview/evaluators/revisions/retrieve + ``` + +2. **Get the URI from revision data:** + ```typescript + const uri = evaluatorRevision.data.uri // "agenta:builtin:auto_exact_match:v0" + ``` + +3. **Invoke via workflow service:** + ``` + POST /preview/workflows/invoke + ``` + +## Comparison: Which Approach to Use? 
+ +| Aspect | Legacy Run | Native Invoke | Revision-Based | +|--------|------------|---------------|----------------| +| **Simplicity** | High | Medium | Low | +| **Frontend Changes** | Minimal | Medium | Significant | +| **Architecture Alignment** | Legacy | Native | Most Native | +| **Flexibility** | Low | High | High | +| **Custom Evaluators** | Limited | Full Support | Full Support | +| **Requires URI** | No (uses key) | Yes | Yes (fetched) | + +**Recommendation:** + +For the Evaluator Playground migration: +- **Short-term:** Keep using legacy `/evaluators/{key}/run/` - it works the same and the backend handles URI resolution internally +- **Long-term:** Consider migrating to native workflow invoke when supporting custom evaluators or revision-specific execution + +--- + +## New SimpleEvaluator CRUD Endpoints + +Base path: `/preview/simple/evaluators/` + +| Endpoint | Method | Purpose | +|----------|--------|---------| +| `/preview/simple/evaluators/` | POST | Create new evaluator | +| `/preview/simple/evaluators/{id}` | GET | Fetch evaluator by ID | +| `/preview/simple/evaluators/{id}` | PUT | Update evaluator | +| `/preview/simple/evaluators/{id}/archive` | POST | Archive (soft delete) evaluator | +| `/preview/simple/evaluators/{id}/unarchive` | POST | Restore archived evaluator | +| `/preview/simple/evaluators/query` | POST | Query evaluators with filters | + +## Data Structures + +### SimpleEvaluator (Response) + +```python +class SimpleEvaluator: + id: UUID + slug: str + + # Lifecycle + created_at: datetime + updated_at: datetime + + # Header + name: Optional[str] + description: Optional[str] + + # Metadata + tags: Optional[List[str]] + meta: Optional[dict] + + # Flags + flags: Optional[SimpleEvaluatorFlags] + + # Data (revision data) + data: Optional[SimpleEvaluatorData] +``` + +### SimpleEvaluatorData (Revision Configuration) + +```python +class SimpleEvaluatorData: + # Version + version: Optional[str] # e.g., "2025.07.14" + + # Service Interface - THE 
KEY FIELD + uri: Optional[str] # e.g., "agenta:builtin:auto_exact_match:v0" + url: Optional[str] # For webhook evaluators + headers: Optional[Dict[str, Union[Reference, str]]] + + # Schema definitions + schemas: Optional[Dict[str, Schema]] # e.g., {"outputs": {...}} + + # Configuration + script: Optional[dict] # For custom code: {"content": "...", "runtime": "python"} + parameters: Optional[dict] # Settings values (same as legacy settings_values) + + # Legacy fields (for backward compatibility) + service: Optional[dict] + configuration: Optional[dict] +``` + +### URI-based Handler Registry + +The SDK maintains registries that map URIs to implementations: + +```python +HANDLER_REGISTRY = { + "agenta": { + "builtin": { + "echo": {"v0": echo_v0}, + "auto_exact_match": {"v0": auto_exact_match_v0}, + "auto_regex_test": {"v0": auto_regex_test_v0}, + # ... all built-in evaluators + } + }, + "user": { + "custom": { + # User-defined evaluators go here + } + } +} +``` + +Retrieve handler by URI: +```python +handler = retrieve_handler("agenta:builtin:auto_exact_match:v0") +``` + +--- + +## Endpoint Comparison: Old vs New (CRUD) + +### List Evaluator Configs + +**Old:** +``` +GET /evaluators/configs/?project_id={project_id} + +Response: EvaluatorConfig[] +{ + id: string + name: string + evaluator_key: string + settings_values: object + created_at: string + updated_at: string +} +``` + +**New:** +``` +POST /preview/simple/evaluators/query?project_id={project_id} + +Request: SimpleEvaluatorQuery +{ + flags?: { is_evaluator: true } +} + +Response: SimpleEvaluatorsResponse +{ + count: number + evaluators: SimpleEvaluator[] +} +``` + +### Create Evaluator Config + +**Old:** +``` +POST /evaluators/configs/?project_id={project_id} + +Request: NewEvaluatorConfig +{ + name: string + evaluator_key: string + settings_values: object +} + +Response: EvaluatorConfig +``` + +**New:** +``` +POST /preview/simple/evaluators/?project_id={project_id} + +Request: SimpleEvaluatorCreateRequest +{ + 
evaluator: { + slug: string # Generated from name + name: string + flags: { is_evaluator: true } + data: { + uri: "agenta:builtin:{evaluator_key}:v0" + parameters: object # settings_values + schemas: { outputs: object } # Output schema + } + } +} + +Response: SimpleEvaluatorResponse +{ + count: number + evaluator: SimpleEvaluator +} +``` + +### Update Evaluator Config + +**Old:** +``` +PUT /evaluators/configs/{id}/?project_id={project_id} + +Request: UpdateEvaluatorConfig +{ + name?: string + settings_values?: object +} + +Response: EvaluatorConfig +``` + +**New:** +``` +PUT /preview/simple/evaluators/{id}?project_id={project_id} + +Request: SimpleEvaluatorEditRequest +{ + evaluator: { + id: UUID + name?: string + data?: { + parameters?: object # settings_values + } + } +} + +Response: SimpleEvaluatorResponse +``` + +### Delete Evaluator Config + +**Old:** +``` +DELETE /evaluators/configs/{id}/?project_id={project_id} + +Response: boolean +``` + +**New:** +``` +POST /preview/simple/evaluators/{id}/archive?project_id={project_id} + +Response: SimpleEvaluatorResponse +``` + +--- + +## Key Differences Summary + +### 1. URI-based Evaluator Identification + +**Old:** `evaluator_key: "auto_exact_match"` + +**New:** `uri: "agenta:builtin:auto_exact_match:v0"` + +The URI enables: +- Version management (`v0`, `v1`, etc.) +- Custom evaluators (`user:custom:my_eval:latest`) +- Handler registry lookup + +### 2. Settings Location + +**Old:** `settings_values: { threshold: 0.5 }` + +**New:** `data.parameters: { threshold: 0.5 }` + +### 3. Output Schema (New) + +The new model includes explicit output schemas: + +```python +data.schemas = { + "outputs": { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "score": {"type": "number"}, + "success": {"type": "boolean"} + } + } +} +``` + +### 4. Soft Delete vs Hard Delete + +- **Old:** Hard delete (`DELETE`) +- **New:** Soft delete via archive (`POST .../archive`) + +### 5. 
Response Wrapper + +**Old:** Returns data directly + +**New:** Returns wrapped response: `{ count: number, evaluator: SimpleEvaluator }` + +--- + +## Frontend Mapping Requirements + +To migrate, the frontend needs to: + +1. **When creating an evaluator:** + - Generate `slug` from name + - Build `uri` from `evaluator_key`: `"agenta:builtin:{evaluator_key}:v0"` + - Move `settings_values` to `data.parameters` + - Set `flags.is_evaluator = true` + - Optionally include `data.schemas.outputs` + +2. **When reading evaluators:** + - Extract `evaluator_key` from `uri` (parse the third segment) + - Read settings from `data.parameters` + - Unwrap response from `{ evaluator: ... }` + +3. **When updating:** + - Include `id` in request body + - Update `data.parameters` for settings changes + +4. **When deleting:** + - Use `POST .../archive` instead of `DELETE` + +5. **When running evaluators:** + - **Option A (Recommended):** Keep using `/evaluators/{key}/run/` - no change needed + - **Option B (Native):** Use `/preview/workflows/invoke` with URI from revision diff --git a/docs/design/migrate-evaluator-playground/plan.md b/docs/design/migrate-evaluator-playground/plan.md new file mode 100644 index 0000000000..35a1f9eb4a --- /dev/null +++ b/docs/design/migrate-evaluator-playground/plan.md @@ -0,0 +1,450 @@ +# Migration Plan: Evaluator Playground + +## Overview + +This plan outlines an incremental migration approach that minimizes risk and allows for gradual rollout. The key principle is **transform at boundaries** - keep internal data shapes stable and only change API interactions. + +## Migration Strategy + +Two viable strategies exist: + +- Plan A (transitional): adapter pattern, keep internal legacy `EvaluatorConfig` shape +- Plan B (preferred destination): direct migration, internal shapes become `SimpleEvaluator` + native invoke + +This file documents Plan A as the low-risk execution plan. 
For the direct plan, see `docs/design/migrate-evaluator-playground/migration-options.md`. + +## Plan A: Adapter Pattern + +Instead of changing data shapes throughout the codebase, we'll: +1. Create adapter functions at the API boundary +2. New endpoints return `SimpleEvaluator`, adapters convert to internal `EvaluatorConfig` shape +3. Internal components continue working unchanged +4. Gradually update internals later (optional) + +``` +┌──────────────┐ ┌──────────────┐ ┌──────────────────┐ +│ New API │ ──► │ Adapter │ ──► │ Internal Shape │ +│ Endpoints │ │ Layer │ │ (unchanged) │ +└──────────────┘ └──────────────┘ └──────────────────┘ +``` + +--- + +## Phase 1: Foundation (Low Risk) + +**Goal:** Create adapter layer and new service functions without changing existing code + +### Tasks + +#### 1.1 Create Type Definitions + +**File:** `web/oss/src/lib/Types.ts` or new file `web/oss/src/services/evaluators/types.ts` + +```typescript +// New API types +interface SimpleEvaluatorData { + version?: string + uri?: string + url?: string + headers?: Record + schemas?: { outputs?: Record } + script?: { content: string; runtime: string } + parameters?: Record +} + +interface SimpleEvaluatorFlags { + is_custom?: boolean + is_evaluator?: boolean + is_human?: boolean +} + +interface SimpleEvaluator { + id: string + slug: string + name?: string + description?: string + tags?: string[] + meta?: Record + flags?: SimpleEvaluatorFlags + data?: SimpleEvaluatorData + created_at: string + updated_at: string +} + +interface SimpleEvaluatorResponse { + count: number + evaluator: SimpleEvaluator | null +} + +interface SimpleEvaluatorsResponse { + count: number + evaluators: SimpleEvaluator[] +} +``` + +#### 1.2 Create Adapter Functions + +**File:** `web/oss/src/services/evaluators/adapters.ts` + +```typescript +import { EvaluatorConfig } from "@/oss/lib/Types" +import { SimpleEvaluator, SimpleEvaluatorData } from "./types" +import { getTagColors } from "@/oss/lib/helpers/colors" +import { 
stringToNumberInRange } from "@/oss/lib/helpers/utils" + +/** + * Extract evaluator_key from URI + * URI format: "agenta:builtin:{key}:v0" + */ +export function extractEvaluatorKey(uri: string | undefined): string { + if (!uri) return "" + const parts = uri.split(":") + if (parts.length >= 3 && parts[0] === "agenta" && parts[1] === "builtin") { + return parts[2] + } + return "" +} + +/** + * Build URI from evaluator key + */ +export function buildEvaluatorUri(evaluatorKey: string): string { + return `agenta:builtin:${evaluatorKey}:v0` +} + +/** + * Convert SimpleEvaluator to internal EvaluatorConfig shape + */ +export function simpleEvaluatorToConfig( + simple: SimpleEvaluator, + projectId?: string +): EvaluatorConfig { + const tagColors = getTagColors() + const evaluatorKey = extractEvaluatorKey(simple.data?.uri) + + return { + id: simple.id, + name: simple.name || "", + evaluator_key: evaluatorKey, + settings_values: simple.data?.parameters || {}, + created_at: simple.created_at, + updated_at: simple.updated_at, + // Frontend additions + color: tagColors[stringToNumberInRange(evaluatorKey, 0, tagColors.length - 1)], + tags: simple.tags, + } +} + +/** + * Convert internal EvaluatorConfig to SimpleEvaluator create payload + */ +export function configToSimpleEvaluatorCreate( + config: Omit<EvaluatorConfig, "id" | "created_at">, + outputsSchema?: Record<string, unknown> +): SimpleEvaluatorCreate { + return { + slug: generateSlug(config.name), + name: config.name, + flags: { is_evaluator: true }, + data: { + uri: buildEvaluatorUri(config.evaluator_key), + parameters: config.settings_values, + schemas: outputsSchema ? 
{ outputs: outputsSchema } : undefined, + }, + } +} + +/** + * Generate slug from name + */ +function generateSlug(name: string): string { + return name + .toLowerCase() + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-|-$/g, "") +} +``` + +#### 1.3 Create New Service Functions + +**File:** `web/oss/src/services/evaluators/index.ts` (add to existing) + +```typescript +// === NEW ENDPOINT FUNCTIONS === + +export const fetchAllEvaluatorConfigsV2 = async ( + projectIdOverride?: string | null, +): Promise<EvaluatorConfig[]> => { + const {projectId: projectIdFromStore} = getProjectValues() + const projectId = projectIdOverride ?? projectIdFromStore + + if (!projectId) return [] + + const response = await axios.post( + `${getAgentaApiUrl()}/preview/simple/evaluators/query?project_id=${projectId}`, + { flags: { is_evaluator: true } } + ) + + const evaluators = response.data?.evaluators || [] + return evaluators.map((e: SimpleEvaluator) => simpleEvaluatorToConfig(e, projectId)) +} + +export const createEvaluatorConfigV2 = async ( + config: CreateEvaluationConfigData, +): Promise<EvaluatorConfig> => { + const {projectId} = getProjectValues() + + const payload = configToSimpleEvaluatorCreate(config) + + const response = await axios.post( + `${getAgentaApiUrl()}/preview/simple/evaluators/?project_id=${projectId}`, + payload, + ) + + const simple = response.data?.evaluator + if (!simple) throw new Error("Failed to create evaluator") + + return simpleEvaluatorToConfig(simple, projectId) +} + +export const updateEvaluatorConfigV2 = async ( + configId: string, + config: Partial<CreateEvaluationConfigData>, +): Promise<EvaluatorConfig> => { + const {projectId} = getProjectValues() + + const payload: SimpleEvaluatorEdit = { + id: configId, + name: config.name, + data: config.settings_values + ? 
{ parameters: config.settings_values } + : undefined, + } + + const response = await axios.put( + `${getAgentaApiUrl()}/preview/simple/evaluators/${configId}?project_id=${projectId}`, + payload, + ) + + const simple = response.data?.evaluator + if (!simple) throw new Error("Failed to update evaluator") + + return simpleEvaluatorToConfig(simple, projectId) +} + +export const deleteEvaluatorConfigV2 = async (configId: string): Promise<boolean> => { + const {projectId} = getProjectValues() + + await axios.post( + `${getAgentaApiUrl()}/preview/simple/evaluators/${configId}/archive?project_id=${projectId}`, + ) + + return true +} +``` + +**Deliverables:** +- [ ] Type definitions for new API shapes +- [ ] Adapter functions (both directions) +- [ ] New service functions with V2 suffix +- [ ] Unit tests for adapters + +**Estimated Effort:** 1-2 days + +--- + +## Phase 2: Feature Flag Integration (Low Risk) + +**Goal:** Add feature flag to toggle between old and new endpoints + +### Tasks + +#### 2.1 Add Feature Flag + +**File:** `web/oss/src/lib/helpers/featureFlags.ts` or environment config + +```typescript +export const USE_NEW_EVALUATOR_ENDPOINTS = + process.env.NEXT_PUBLIC_USE_NEW_EVALUATOR_ENDPOINTS === "true" +``` + +#### 2.2 Create Unified Service Functions + +**File:** `web/oss/src/services/evaluators/index.ts` + +```typescript +// Unified functions that use feature flag +export const fetchAllEvaluatorConfigs = async (...args) => { + if (USE_NEW_EVALUATOR_ENDPOINTS) { + return fetchAllEvaluatorConfigsV2(...args) + } + return fetchAllEvaluatorConfigsLegacy(...args) +} + +export const createEvaluatorConfig = async (...args) => { + if (USE_NEW_EVALUATOR_ENDPOINTS) { + return createEvaluatorConfigV2(...args) + } + return createEvaluatorConfigLegacy(...args) +} + +// ... 
same for update and delete +``` + +**Deliverables:** +- [ ] Feature flag configuration +- [ ] Unified service functions with flag branching +- [ ] Documentation for enabling flag + +**Estimated Effort:** 0.5 days + +--- + +## Phase 3: Integration Testing (Medium Risk) + +**Goal:** Verify new endpoints work correctly with existing UI + +### Tasks + +#### 3.1 Enable Feature Flag in Development + +- Set `NEXT_PUBLIC_USE_NEW_EVALUATOR_ENDPOINTS=true` in dev environment +- Test all evaluator playground flows + +#### 3.2 Test Cases + +1. **List Evaluators** + - [ ] Registry shows all existing evaluator configs + - [ ] Correct names, types, and icons displayed + - [ ] Filtering and search work + +2. **Create Evaluator** + - [ ] Select template → Configure → Commit + - [ ] Settings saved correctly + - [ ] Redirects to edit page after create + +3. **Edit Evaluator** + - [ ] Load existing config + - [ ] Form populated with current values + - [ ] Update settings + - [ ] Changes persisted + +4. **Delete Evaluator** + - [ ] Delete confirmation works + - [ ] Evaluator removed from list + - [ ] No errors + +5. **Test Evaluator** + - [ ] Load testcase + - [ ] Run variant + - [ ] Run evaluator + - [ ] Results displayed correctly + +**Deliverables:** +- [ ] Test results document +- [ ] Bug fixes for any issues found +- [ ] Performance comparison (if applicable) + +**Estimated Effort:** 2-3 days + +--- + +## Phase 4: Gradual Rollout (Low Risk) + +**Goal:** Enable new endpoints for subset of users + +### Tasks + +#### 4.1 Staged Rollout + +1. **Internal testing:** Enable for team members only +2. **Beta users:** Enable for opt-in users +3. 
**General availability:** Enable for all users + +#### 4.2 Monitoring + +- Monitor error rates for evaluator operations +- Track API response times +- Watch for unexpected 404/500 errors + +**Deliverables:** +- [ ] Rollout schedule +- [ ] Rollback procedure documented +- [ ] Monitoring dashboards/alerts + +**Estimated Effort:** 1-2 weeks (elapsed time) + +--- + +## Phase 5: Cleanup (Low Risk) + +**Goal:** Remove legacy code and feature flag + +### Tasks + +#### 5.1 Remove Legacy Functions + +- Remove `fetchAllEvaluatorConfigsLegacy` +- Remove `createEvaluatorConfigLegacy` +- Remove `updateEvaluatorConfigLegacy` +- Remove `deleteEvaluatorConfigLegacy` + +#### 5.2 Remove Feature Flag + +- Remove feature flag checks +- Clean up V2 suffix from function names + +#### 5.3 Update Documentation + +- Update API documentation +- Update developer docs + +**Deliverables:** +- [ ] Legacy code removed +- [ ] Feature flag removed +- [ ] Documentation updated +- [ ] PR for cleanup + +**Estimated Effort:** 1 day + +--- + +## Timeline Summary + +| Phase | Duration | Risk | Dependencies | +|-------|----------|------|--------------| +| Phase 1: Foundation | 1-2 days | Low | None | +| Phase 2: Feature Flag | 0.5 days | Low | Phase 1 | +| Phase 3: Integration Testing | 2-3 days | Medium | Phase 2, Backend PR merged | +| Phase 4: Gradual Rollout | 1-2 weeks | Low | Phase 3 | +| Phase 5: Cleanup | 1 day | Low | Phase 4 complete | + +**Total Implementation Time:** ~5-7 days +**Total Rollout Time:** ~2-3 weeks + +--- + +## Rollback Plan + +If issues are discovered after deployment: + +1. **Immediate:** Set feature flag to `false` +2. **Short-term:** Deploy hotfix to disable new endpoints +3. **Investigation:** Analyze issues with new endpoints +4. **Resolution:** Fix and re-test before re-enabling + +--- + +## Open Questions + +1. **Output Schema Generation:** Should the frontend generate output schemas when creating evaluators, or should the backend handle this? 
+ - Current PR shows backend generates schemas during migration + - Frontend may need to include schema for new configs + +2. **Slug Generation:** Should slugs be generated client-side or server-side? + - Server-side is safer (uniqueness checks) + - Client-side is faster (no round-trip) + +3. **Error Handling:** How should the frontend handle validation errors from new endpoints? + - New endpoints may return different error shapes + - Need to map to user-friendly messages diff --git a/docs/design/migrate-evaluator-playground/research.md b/docs/design/migrate-evaluator-playground/research.md new file mode 100644 index 0000000000..eda511d37b --- /dev/null +++ b/docs/design/migrate-evaluator-playground/research.md @@ -0,0 +1,211 @@ +# Research Notes: Evaluator Execution Architecture + +## Findings from PR #3527 Investigation + +### Discovery: Native Evaluator Execution Path + +The new architecture treats evaluators as workflows with URI-based identification. The key discovery is that even the legacy `/evaluators/{key}/run/` endpoint now uses the native handler registry internally. + +### Handler Registry Architecture + +The SDK maintains a global registry of workflow handlers: + +**Location:** `sdk/agenta/sdk/workflows/utils.py` + +```python +HANDLER_REGISTRY = { + "agenta": { + "builtin": { + "echo": {"v0": echo_v0}, + "auto_exact_match": {"v0": auto_exact_match_v0}, + "auto_regex_test": {"v0": auto_regex_test_v0}, + "field_match_test": {"v0": field_match_test_v0}, + "json_multi_field_match": {"v0": json_multi_field_match_v0}, + "auto_webhook_test": {"v0": auto_webhook_test_v0}, + "auto_custom_code_run": {"v0": auto_custom_code_run_v0}, + "auto_ai_critique": {"v0": auto_ai_critique_v0}, + # ... 
more evaluators + } + }, + "user": { + "custom": { + # Custom user evaluators + } + } +} +``` + +**URI Format:** `provider:kind:key:version` + +Examples: +- `agenta:builtin:auto_exact_match:v0` +- `user:custom:my_custom_eval:latest` + +**URI Parsing:** +```python +def parse_uri(uri: str) -> Tuple[provider, kind, key, version]: + # "agenta:builtin:echo:v0" → ("agenta", "builtin", "echo", "v0") +``` + +### How the Legacy Run Endpoint Works Now (PR #3527) + +**File:** `api/oss/src/routers/evaluators_router.py` + +The PR changed the implementation to use the native handler registry: + +```python +@router.post("/{evaluator_key}/run/", response_model=EvaluatorOutputInterface) +async def evaluator_run(request: Request, evaluator_key: str, payload: EvaluatorInputInterface): + # ... auth setup ... + result = await _run_evaluator(evaluator_key, payload) + return result + +async def _run_evaluator(evaluator_key: str, evaluator_input: EvaluatorInputInterface): + # Build URI from evaluator_key + uri = f"agenta:builtin:{evaluator_key}:v0" + + # Retrieve the handler from SDK registry + handler = retrieve_handler(uri) + if handler is None: + raise NotImplementedError(f"Evaluator {evaluator_key} not found (uri={uri})") + + # Extract data from evaluator_input + inputs = evaluator_input.inputs or {} + settings = evaluator_input.settings or {} + outputs = inputs.get("prediction", inputs.get("output")) + + # Build kwargs based on handler signature + sig = inspect.signature(handler) + kwargs = {} + if "parameters" in sig.parameters: + kwargs["parameters"] = settings + if "inputs" in sig.parameters: + kwargs["inputs"] = inputs + if "outputs" in sig.parameters: + kwargs["outputs"] = outputs + + # Invoke the handler + result = handler(**kwargs) + if inspect.iscoroutine(result): + result = await result + + return {"outputs": result} +``` + +**Key Insight:** The legacy endpoint is now a thin wrapper that: +1. Builds the URI from the evaluator_key +2. Looks up the handler in the registry +3. 
Invokes it directly + +### Native Workflow Invoke Path + +For fully native execution, there's also a generic workflow invoke endpoint: + +**Endpoint:** `POST /preview/workflows/invoke` + +**Request Structure:** +```python +class WorkflowServiceRequest: + data: WorkflowServiceRequestData # inputs, outputs, parameters + revision: Optional[dict] # contains URI in data.uri +``` + +**How Batch Evaluations Use It:** + +**File:** `api/oss/src/core/evaluations/tasks/legacy.py` (lines 1185-1228) + +```python +workflow_service_request_data = WorkflowServiceRequestData( + inputs=inputs, + outputs=outputs, + # + parameters=evaluator_reference.get("configuration"), # settings +) + +workflow_service_request = WorkflowServiceRequest( + data=workflow_service_request_data, + # + environment=environment, + revision=evaluator_reference.get("revision"), # contains URI +) + +await workflows_service.invoke_workflow( + project_id=project_id, + user_id=user_id, + request=workflow_service_request, +) +``` + +### Implications for Frontend Migration + +#### For Evaluator CRUD (Create/Read/Update/Delete) + +**Must migrate to new endpoints** because: +- Legacy endpoints now call SimpleEvaluator endpoints internally +- Data is stored in new workflow-based format +- Frontend should use native API to avoid translation overhead + +#### For Evaluator Run (Testing in Playground) + +**Options:** + +1. **Keep using `/evaluators/{key}/run/`** (Recommended for now) + - Simplest approach + - Endpoint still works + - Internally uses native path + - No frontend changes needed + +2. **Use native workflow invoke** + - Requires building `WorkflowServiceRequest` + - Need to include evaluator revision with URI + - More complex but more "correct" + - Enables custom evaluator support + +3. 
**Hybrid approach** + - Use legacy endpoint for built-in evaluators + - Use native invoke for custom evaluators (which will have custom URIs) + +### Questions Resolved + +**Q: Why does the legacy run endpoint remain unchanged?** + +A: It's not unchanged internally - PR #3527 refactored it to use the native handler registry. But the external interface (URL, request/response format) is preserved for backward compatibility. + +**Q: Is there a "native" way to run evaluators?** + +A: Yes, via the workflow invoke endpoint with `WorkflowServiceRequest` containing the evaluator's URI. But for the playground, the legacy endpoint is simpler and equivalent. + +**Q: Should we migrate the run endpoint usage?** + +A: Not necessarily. The benefits of migrating would be: +- Consistency with new architecture +- Support for custom evaluators with custom URIs +- Ability to run specific evaluator revisions + +But the costs are: +- More complex payload construction +- Need to fetch evaluator revision to get URI +- No immediate user-facing benefit + +**Recommendation:** Keep using legacy run endpoint for now, plan native invoke for custom evaluator feature. + +## Note on "Qdrant changes" + +Within this repository, Qdrant appears in examples and cookbook/tutorial code (e.g., `examples/python/*`, `docs/docs/tutorials/*`), but not in the core evaluator/workflow execution path under `api/oss/src`. + +Implication for this migration: +- Migrating the evaluator playground to `/preview/workflows/invoke` does not require any Qdrant-specific frontend changes. +- Any Qdrant-related behavior is part of the *application/workflow being evaluated* (e.g., a RAG app calling Qdrant), and would surface only through normal workflow invocation inputs/outputs/traces. 
+ +--- + +## Related Files Analyzed + +- `api/oss/src/routers/evaluators_router.py` - Legacy endpoints (now with native internals) +- `api/oss/src/apis/fastapi/evaluators/router.py` - New SimpleEvaluators router +- `api/oss/src/apis/fastapi/workflows/router.py` - Workflow invoke endpoint +- `api/oss/src/core/workflows/service.py` - Workflow invocation service +- `api/oss/src/core/evaluations/tasks/legacy.py` - Batch evaluation using native invoke +- `sdk/agenta/sdk/workflows/utils.py` - Handler registry and URI parsing +- `sdk/agenta/sdk/workflows/interfaces.py` - Evaluator interfaces (schemas) +- `sdk/agenta/sdk/workflows/handlers.py` - Actual evaluator implementations diff --git a/docs/design/migrate-evaluator-playground/risk-analysis.md b/docs/design/migrate-evaluator-playground/risk-analysis.md new file mode 100644 index 0000000000..0bd037f0a0 --- /dev/null +++ b/docs/design/migrate-evaluator-playground/risk-analysis.md @@ -0,0 +1,287 @@ +# Risk Analysis: Evaluator Playground Migration + +## Coupling Points + +### 1. State Management Coupling + +**Location:** `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts` + +**Risk Level:** MEDIUM + +The playground state is tightly coupled to the `EvaluatorConfig` shape: + +```typescript +// playgroundEditValuesAtom expects EvaluatorConfig shape +interface EvaluatorConfig { + id: string + evaluator_key: string + name: string + settings_values: Record<string, any> +} +``` + +**Impact:** +- `commitPlaygroundAtom` expects `EvaluatorConfig` as input +- `playgroundEditValuesAtom` is read throughout ConfigureEvaluator and DebugSection +- Form initialization relies on `settings_values` property name + +**Mitigation:** +- Create adapter functions to convert between `SimpleEvaluator` and internal state +- Or update atoms to use `SimpleEvaluator` shape and update all consumers + +--- + +### 2. 
Form Initialization Coupling + +**Location:** `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx` + +**Risk Level:** MEDIUM + +Form initialization directly accesses `settings_values`: + +```typescript +// Line 383-410 +if (editMode && editEvalEditValues) { + form.setFieldsValue({ + ...editEvalEditValues, + settings_values: editEvalEditValues.settings_values || {}, + }) +} +``` + +**Impact:** +- Changing to `data.parameters` would break form binding +- DynamicFormField components use `["settings_values", field.key]` name paths + +**Mitigation:** +- Keep internal form structure as `settings_values` +- Transform on API boundary (adapter pattern) + +--- + +### 3. Service Layer Coupling + +**Location:** `web/oss/src/services/evaluators/index.ts` + +**Risk Level:** LOW-MEDIUM + +API calls directly construct legacy payload shapes: + +```typescript +// createEvaluatorConfig +return axios.post(`/evaluators/configs?project_id=${projectId}`, { + ...config, +}) + +// updateEvaluatorConfig +return axios.put(`/evaluators/configs/${configId}?project_id=${projectId}`, config) +``` + +**Impact:** +- Need to update URLs and payload transformation +- Response handling needs to unwrap `{ evaluator: ... }` wrapper + +**Mitigation:** +- Create new service functions for new endpoints +- Keep old functions temporarily for gradual migration +- Add response/request transformers + +--- + +### 4. 
Evaluators Registry Coupling + +**Location:** `web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts` + +**Risk Level:** MEDIUM + +The hook transforms and combines data from two sources: + +```typescript +const {evaluatorConfigs} = useFetchEvaluatorsData() +// Combines with evaluator templates for display +``` + +**Impact:** +- Table columns expect `evaluator_key` property +- Tag cells, type pills depend on config shape +- Filtering/search operates on legacy property names + +**Mitigation:** +- Update hook to handle new `SimpleEvaluator` shape +- Transform data at fetch boundary, keep internal shape consistent + +--- + +### 5. Debug Section - Evaluator Run Coupling + +**Location:** `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DebugSection.tsx` + +**Risk Level:** LOW + +The evaluator run uses `evaluator_key` directly: + +```typescript +// Line 456 +const runResponse = await createEvaluatorRunExecution( + selectedEvaluator.key, // evaluator_key + { inputs: outputs, settings: ... } +) +``` + +**Impact:** +- The external interface of this endpoint (`/evaluators/{key}/run/`) remains unchanged; PR #3527 only refactored its internals to use the native handler registry +- Uses `selectedEvaluator.key` from template, not config +- No direct coupling to `EvaluatorConfig` shape + +**Mitigation:** +- No changes needed for run functionality +- Keep using evaluator templates for the `key` value + +--- + +### 6. 
Global Atoms Coupling + +**Location:** `web/oss/src/state/evaluators/atoms.ts` + +**Risk Level:** MEDIUM + +Query atoms return legacy-shaped data: + +```typescript +const evaluatorConfigsQueryAtomFamily = atomFamily((projectId) => + atomWithQuery(() => ({ + queryKey: ['evaluator-configs', projectId], + queryFn: () => fetchAllEvaluatorConfigs(null, projectId), + })) +) +``` + +**Impact:** +- Multiple components may depend on these atoms +- Changing shape could cascade through application + +**Mitigation:** +- Update query functions to use new endpoints +- Transform response at query boundary to maintain internal shape +- Or update all consumers to handle new shape + +--- + +### 7. Evaluator Templates vs Configs Distinction + +**Location:** Throughout frontend + +**Risk Level:** LOW + +The frontend distinguishes between: +- **Evaluator templates** (`Evaluator`): Built-in evaluator definitions with `settings_template` +- **Evaluator configs** (`EvaluatorConfig`): User-created configurations with `settings_values` + +**Impact:** +- This distinction is maintained in the new system +- Templates come from `/evaluators/` (unchanged) +- Configs become `SimpleEvaluator` objects + +**Mitigation:** +- No conceptual change needed +- Just update config handling + +--- + +## Risk Summary Table + +| Component | Risk Level | Complexity | Priority | +|-----------|-----------|------------|----------| +| Service Layer | LOW-MEDIUM | LOW | HIGH (change first) | +| State Atoms | MEDIUM | MEDIUM | HIGH | +| ConfigureEvaluator Form | MEDIUM | MEDIUM | MEDIUM | +| Evaluators Registry | MEDIUM | MEDIUM | MEDIUM | +| Debug Section | LOW | LOW | LOW | +| Global Query Atoms | MEDIUM | LOW | MEDIUM | + +## Concrete Breakage Scenarios + +### Scenario 1: Form Submission Fails + +**Trigger:** Change `settings_values` to `data.parameters` without updating form + +**Symptoms:** +- Form submits but settings are lost +- Backend receives empty configuration +- Evaluator created but doesn't work + 
+**Prevention:** +- Transform at API boundary, not in form +- Test form submission with real backend + +--- + +### Scenario 2: Evaluator List Empty + +**Trigger:** Query endpoint returns new shape, UI expects old + +**Symptoms:** +- Evaluators registry shows empty list +- No error messages (data exists but unparseable) +- Console shows undefined property access + +**Prevention:** +- Update data transformation in hook +- Add null checks and fallbacks +- Log transformation errors + +--- + +### Scenario 3: Edit Mode Fails to Load + +**Trigger:** `playgroundEditValuesAtom` receives `SimpleEvaluator`, expects `EvaluatorConfig` + +**Symptoms:** +- Navigate to edit page, form is empty +- Settings not populated +- Save overwrites with empty config + +**Prevention:** +- Transform at atom level +- Test edit flow with existing configs + +--- + +### Scenario 4: Delete Fails Silently + +**Trigger:** `DELETE` endpoint no longer exists, `POST .../archive` required + +**Symptoms:** +- Click delete, no error +- Evaluator still appears +- Network tab shows 404/405 + +**Prevention:** +- Update delete function to use archive endpoint +- Verify response handling + +--- + +## Recommended Testing Strategy + +### Unit Tests +- [ ] Service layer transformers (old shape ↔ new shape) +- [ ] URI parsing (`agenta:builtin:key:v0` → `key`) +- [ ] Slug generation from name + +### Integration Tests +- [ ] Create evaluator config flow +- [ ] Edit evaluator config flow +- [ ] Delete (archive) evaluator config flow +- [ ] List/query evaluator configs flow + +### E2E Tests +- [ ] Full playground flow: select template → configure → test → commit +- [ ] Edit existing evaluator configuration +- [ ] Clone evaluator configuration +- [ ] Delete evaluator configuration + +### Regression Tests +- [ ] Evaluator run still works +- [ ] Batch evaluations still work (use config IDs) +- [ ] Existing configs load correctly after migration diff --git a/docs/design/migrate-evaluator-playground/status.md 
b/docs/design/migrate-evaluator-playground/status.md new file mode 100644 index 0000000000..e0f32606eb --- /dev/null +++ b/docs/design/migrate-evaluator-playground/status.md @@ -0,0 +1,134 @@ +# Status: Evaluator Playground Migration + +## Current Phase: Research Complete + +**Last Updated:** 2026-01-27 + +--- + +## Progress Summary + +### Completed + +- [x] Map current Evaluator Playground implementation + - Identified all frontend components + - Documented state management (atoms) + - Mapped API endpoints used + - Documented data flow + +- [x] Analyze PR #3527 (backend migration) + - Understood new `SimpleEvaluator` data model + - Documented new endpoint shapes + - Identified backward compatibility layer + +- [x] Investigate native evaluator execution path + - Confirmed `/evaluators/{key}/run` now resolves `agenta:builtin:{key}:v0` via SDK handler registry + - Confirmed native workflow execution endpoint exists: `POST /preview/workflows/invoke` + - Documented request structure used by batch evaluation tasks + +- [x] Compare old vs new endpoints + - Documented request/response differences + - Identified URI-based evaluator identification + - Noted response wrapper changes + +- [x] Identify coupling and risk areas + - State management coupling (MEDIUM risk) + - Form initialization coupling (MEDIUM risk) + - Service layer coupling (LOW-MEDIUM risk) + - Created risk mitigation strategies + +- [x] Propose migration plan + - Adapter pattern approach + - Feature flag integration + - Phased rollout strategy + +### In Progress + +- [ ] Phase 1: Foundation - Not started + +### Blocked + +- [ ] Phase 3: Integration Testing - Blocked on PR #3527 merge + +--- + +## Key Findings + +### 1. 
The `/evaluators/{key}/run/` endpoint works but is now a wrapper + +**Important Discovery:** PR #3527 refactored the legacy run endpoint to use the native handler registry internally: +- It builds a URI from the evaluator_key: `agenta:builtin:{key}:v0` +- Uses `retrieve_handler(uri)` to get the actual handler function +- Directly invokes the handler + +**Implication:** The external interface is unchanged, but internally it uses the new architecture. + +### 2. Native workflow invoke path exists + +There's a fully native way to run evaluators: +- Endpoint: `POST /preview/workflows/invoke` +- Uses `WorkflowServiceRequest` with URI in revision data +- Same mechanism used by batch evaluations + +**Recommendation:** Keep using legacy endpoint for now (simpler), consider native invoke for future custom evaluator support. + +### 3. URI-based handler registry + +The SDK maintains a `HANDLER_REGISTRY` that maps URIs to handler functions: +- Format: `agenta:builtin:{evaluator_key}:v0` +- Supports custom evaluators: `user:custom:my_eval:latest` +- Enables version management of evaluator implementations + +### 4. Adapter pattern minimizes risk + +By transforming data at the API boundary, we can: +- Keep internal data shapes unchanged +- Minimize code changes +- Enable easy rollback via feature flag + +### 5. Output schema handling + +The new `SimpleEvaluator` model includes explicit output schemas. The backend migration generates these from evaluator settings. 
For new configs: +- Built-in evaluators: Schema can be derived from evaluator type +- Custom evaluators: Schema should be provided by user + +--- + +## Decisions Made + +| Decision | Rationale | Date | +|----------|-----------|------| +| Use adapter pattern | Minimizes changes to internal code, enables gradual migration | 2026-01-27 | +| Feature flag approach | Allows gradual rollout and easy rollback | 2026-01-27 | +| Keep form structure as `settings_values` | Avoid cascading changes to form components | 2026-01-27 | + +--- + +## Open Questions + +1. **Run migration target:** For full migration, do we want the playground to invoke by: + - built-in key -> URI (`agenta:builtin:{key}:v0`), or + - evaluator revision URI stored on `SimpleEvaluator.data.uri` (preferred), or + - a specific evaluator revision id (even more explicit)? +2. **Output Schema:** Confirm whether frontend must provide `data.schemas.outputs` on create/edit, or backend will derive defaults. +3. **Slug Generation:** Client-side or server-side? + +--- + +## Next Steps + +1. Wait for PR #3527 to be merged +2. Start Phase 1: Create type definitions and adapters +3. Add feature flag infrastructure +4. 
Test with new endpoints + +--- + +## Related Links + +- [PR #3527: Migrate evaluators but keep legacy endpoints](https://github.com/Agenta-AI/agenta/pull/3527) +- [context.md](./context.md) - Background and goals +- [current-system.md](./current-system.md) - Current implementation details +- [new-endpoints.md](./new-endpoints.md) - New endpoint documentation +- [risk-analysis.md](./risk-analysis.md) - Coupling and risk analysis +- [plan.md](./plan.md) - Migration execution plan From df1e622917e48c9d5731890c95dee6355150472e Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Tue, 27 Jan 2026 19:06:38 +0100 Subject: [PATCH 2/4] docs: update plan to direct migration (no adapters), split into PR 1 (CRUD) and PR 2 (Run) --- .../migrate-evaluator-playground/README.md | 74 +- .../migrate-evaluator-playground/context.md | 45 +- .../migration-options.md | 153 ++-- .../migrate-evaluator-playground/plan.md | 740 ++++++++++++------ .../risk-analysis.md | 133 ++-- .../migrate-evaluator-playground/status.md | 99 +-- 6 files changed, 796 insertions(+), 448 deletions(-) diff --git a/docs/design/migrate-evaluator-playground/README.md b/docs/design/migrate-evaluator-playground/README.md index 4197c667d1..b0b9d0c319 100644 --- a/docs/design/migrate-evaluator-playground/README.md +++ b/docs/design/migrate-evaluator-playground/README.md @@ -2,12 +2,23 @@ ## Overview -This planning workspace documents the migration of the Evaluator Playground frontend to use the new workflow-based evaluator endpoints. The backend team has migrated evaluators from the old `EvaluatorConfig` model to the new `SimpleEvaluator` (workflow-based) model, and has created backward-compatible legacy endpoints. This migration will update the frontend to use the new endpoints directly. +This planning workspace documents the migration of the Evaluator Playground frontend to use the new workflow-based evaluator endpoints. 
The backend team has migrated evaluators from the old `EvaluatorConfig` model to the new `SimpleEvaluator` (workflow-based) model. + +## Migration Strategy + +**Direct migration (no adapters)** split into two PRs: + +| PR | Scope | Description | +|----|-------|-------------| +| **PR 1** | CRUD | Migrate to `/preview/simple/evaluators/*`, change internal types to `SimpleEvaluator` | +| **PR 2** | Run | Migrate to `/preview/workflows/invoke`, add workflow service types | + +See [plan.md](./plan.md) for detailed implementation steps. ## Context -- **PR #3527**: Backend migration that introduces new evaluator endpoints while keeping legacy endpoints for backward compatibility -- **Goal**: Migrate the Evaluator Playground frontend to use new endpoints, improving consistency with the new workflow-based architecture +- **PR #3527**: Backend migration that introduces new evaluator endpoints +- **Goal**: Full migration to new endpoints, no legacy code remaining ## Documents @@ -17,24 +28,57 @@ This planning workspace documents the migration of the Evaluator Playground fron | [current-system.md](./current-system.md) | Detailed map of current Evaluator Playground implementation | | [new-endpoints.md](./new-endpoints.md) | New evaluator endpoint shapes and differences from legacy | | [research.md](./research.md) | Deep dive into evaluator execution architecture and URI-based handlers | -| [migration-options.md](./migration-options.md) | Migration plan options: direct vs transitional approaches | +| [migration-options.md](./migration-options.md) | Why we chose direct migration over adapters | | [risk-analysis.md](./risk-analysis.md) | Coupling points and risk areas for the migration | -| [plan.md](./plan.md) | Migration execution plan with phases and milestones | +| [plan.md](./plan.md) | **Main plan** - PR 1 (CRUD) and PR 2 (Run) implementation details | | [status.md](./status.md) | Living document for progress updates and decisions | -## Key Files Affected +## Key Mapping 
Changes + +| Legacy | New | +|--------|-----| +| `EvaluatorConfig` | `SimpleEvaluator` | +| `evaluator_key` | derived from `data.uri` | +| `settings_values` | `data.parameters` | +| `GET /evaluators/configs/` | `POST /preview/simple/evaluators/query` | +| `POST /evaluators/configs/` | `POST /preview/simple/evaluators/` | +| `PUT /evaluators/configs/{id}/` | `PUT /preview/simple/evaluators/{id}` | +| `DELETE /evaluators/configs/{id}/` | `POST /preview/simple/evaluators/{id}/archive` | +| `POST /evaluators/{key}/run/` | `POST /preview/workflows/invoke` | -### Frontend - Core Components -- `web/oss/src/components/Evaluators/` - Evaluators registry -- `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/` - Playground UI -- `web/oss/src/services/evaluators/index.ts` - API service layer -- `web/oss/src/services/evaluations/api_ee/index.ts` - Evaluator run execution +## Files Affected -### Frontend - State Management -- `web/oss/src/state/evaluators/atoms.ts` - Evaluator query atoms -- `web/oss/src/lib/atoms/evaluation.ts` - Legacy evaluation atoms +### PR 1: CRUD Migration + +| Area | Files | +|------|-------| +| Types | `web/oss/src/lib/Types.ts` | +| Services | `web/oss/src/services/evaluators/index.ts` | +| State | `web/oss/src/state/evaluators/atoms.ts` | +| Playground State | `web/oss/src/components/.../ConfigureEvaluator/state/atoms.ts` | +| Playground UI | `web/oss/src/components/.../ConfigureEvaluator/index.tsx` | +| Registry | `web/oss/src/components/Evaluators/index.tsx` | +| Registry Hook | `web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts` | +| Columns | `web/oss/src/components/Evaluators/assets/getColumns.tsx` | + +### PR 2: Run Migration + +| Area | Files | +|------|-------| +| Types | `web/oss/src/lib/Types.ts` (add workflow types) | +| Invoke Service | `web/oss/src/services/workflows/invoke.ts` (new) | +| Debug Section | `web/oss/src/components/.../ConfigureEvaluator/DebugSection.tsx` | ### 
Backend Reference (PR #3527) -- `api/oss/src/routers/evaluators_router.py` - Legacy endpoints (kept for backward compatibility) +- `api/oss/src/routers/evaluators_router.py` - Legacy endpoints (kept temporarily) - `api/oss/src/apis/fastapi/evaluators/router.py` - New `SimpleEvaluators` router +- `api/oss/src/apis/fastapi/workflows/router.py` - Workflow invoke endpoint - `api/oss/src/core/evaluators/dtos.py` - New data transfer objects + +## Effort Estimate + +| PR | Effort | +|----|--------| +| PR 1: CRUD | 4-5 days | +| PR 2: Run | 3-4 days | +| **Total** | **7-9 days** | diff --git a/docs/design/migrate-evaluator-playground/context.md b/docs/design/migrate-evaluator-playground/context.md index 4f2fdae9be..5fa82e8b21 100644 --- a/docs/design/migrate-evaluator-playground/context.md +++ b/docs/design/migrate-evaluator-playground/context.md @@ -24,34 +24,49 @@ The Evaluator Playground frontend currently uses legacy endpoints: The backend (PR #3527) has: 1. Migrated all evaluator configs to the new workflow-based model via DB migrations 2. Created new `SimpleEvaluators` endpoints at `/preview/simple/evaluators/` -3. Kept legacy endpoints as thin wrappers that convert new model back to legacy format +3. Native workflow execution available at `/preview/workflows/invoke` +4. Kept legacy endpoints as thin wrappers (to be deprecated) **The frontend needs to migrate to use the new endpoints directly.** ## Goals 1. **Replace legacy evaluator config CRUD** with new `SimpleEvaluator` endpoints -2. **Update data models** in frontend to match new `SimpleEvaluator` shape -3. **Maintain backward compatibility** during transition (feature flag or gradual rollout) -4. **Keep the evaluator run endpoint** (`/evaluators/{key}/run/`) - this remains unchanged -5. **Preserve UX** - no user-facing changes to the Evaluator Playground functionality +2. **Replace legacy evaluator run** with native workflow invoke (`/preview/workflows/invoke`) +3. 
**Update data models** in frontend to match new `SimpleEvaluator` shape (no adapters) +4. **Preserve UX** - no user-facing changes to the Evaluator Playground functionality +5. **Remove all legacy endpoint usage** - clean migration, no dual-path code ## Non-Goals -1. **Not migrating the evaluator run endpoint** - The `/evaluators/{key}/run/` endpoint is still used and works the same way -2. **Not changing the Evaluator Playground UI** - Only the data layer changes -3. **Not migrating evaluation batch runs** - Those use evaluator revision IDs which are handled by the backend migration -4. **Not introducing new evaluator features** - This is a pure backend migration +1. **Not changing the Evaluator Playground UI** - Only the data layer changes +2. **Not migrating evaluation batch runs** - Those already use the new workflow system internally +3. **Not introducing new evaluator features** - This is a pure endpoint migration ## Success Criteria -1. Evaluator Playground can create, edit, delete evaluators using new endpoints -2. All existing evaluator configurations continue to work -3. No regression in evaluator testing functionality -4. Clean removal of legacy endpoint usage in frontend +1. Evaluator Playground can create, edit, delete evaluators using new `SimpleEvaluator` endpoints +2. Evaluator Playground can run evaluators using native workflow invoke +3. All existing evaluator configurations continue to work +4. No regression in evaluator testing functionality +5. No legacy endpoint calls remain in frontend code ## Constraints 1. Must not break existing evaluator configurations -2. Must coordinate with backend team on endpoint availability -3. Should be deployable incrementally (not big-bang) +2. Must coordinate with backend team on endpoint availability (PR #3527) +3. 
Split into two PRs for reviewability (CRUD first, then Run) + +## Migration Approach + +**Direct migration (no adapters):** + +| PR | Scope | Endpoints | +|----|-------|-----------| +| PR 1 | CRUD | `/preview/simple/evaluators/*` | +| PR 2 | Run | `/preview/workflows/invoke` | + +This approach: +- Avoids tech debt from adapter layers +- Aligns internal types with backend models +- Keeps changes reviewable by splitting into two PRs diff --git a/docs/design/migrate-evaluator-playground/migration-options.md b/docs/design/migrate-evaluator-playground/migration-options.md index 6ea44db0de..40bf6b4caa 100644 --- a/docs/design/migrate-evaluator-playground/migration-options.md +++ b/docs/design/migrate-evaluator-playground/migration-options.md @@ -1,125 +1,106 @@ -# Migration Options (Plan A vs Plan B) +# Migration Options ## Goal Full migration of the Evaluator Playground to the new workflow-based evaluator APIs, including: -- CRUD on evaluator configs via `/preview/simple/evaluators/*` (or the richer `/preview/evaluators/*` family) +- CRUD on evaluator configs via `/preview/simple/evaluators/*` - Running evaluators via native workflow invocation (`/preview/workflows/invoke`) instead of the legacy `/evaluators/{key}/run` -This doc lists two concrete migration strategies. - --- -## Plan A (Transitional): Keep Internal Shapes Stable +## Option A (Rejected): Adapter Pattern -This is the earlier approach: keep the UI/state assuming the legacy `EvaluatorConfig` shape and translate at the API boundary. +Keep the UI/state assuming the legacy `EvaluatorConfig` shape and translate at the API boundary. 
-### Why it exists +### Why it was considered -- Minimizes touching UI/atoms/forms +- Minimizes touching UI/atoms/forms initially - Lets you swap endpoints quickly with limited regression surface - Good when backend is still stabilizing schemas -### Trade-offs +### Why it was rejected -- Adds an extra abstraction layer (adapters) -- Can delay paying down legacy assumptions (`settings_values`, `evaluator_key`, etc.) +- Adds tech debt (adapter layer becomes permanent) +- Delays alignment with new architecture +- Makes future changes harder (two mental models) --- -## Plan B (Preferred): Direct Migration (No Adapters) +## Option B (Chosen): Direct Migration -This changes the frontend domain model to match the backend reality: -- “Evaluator config” becomes `SimpleEvaluator` (workflow artifact w/ latest evaluator revision data attached) -- Execution uses workflow invocation (`/preview/workflows/invoke`) using evaluator `data.uri` +Change the frontend domain model to match the backend: +- "Evaluator config" becomes `SimpleEvaluator` +- Internal shapes use `data.parameters` instead of `settings_values` +- Internal shapes derive `evaluator_key` from `data.uri` -### Why it’s better long-term +### Why it's better -- Eliminates translation debt -- Aligns with “evaluators are workflows” concept end-to-end +- No translation debt +- Aligns with "evaluators are workflows" concept end-to-end - Unlocks revision-aware runs and custom evaluator URIs +- Cleaner codebase long-term -### Initial Scope (not exhaustive) - -#### 1) Data model and type changes - -- Introduce TS types for `SimpleEvaluator*` and `WorkflowService*` (request/response) -- Replace usages of `EvaluatorConfig` in the evaluator playground path with `SimpleEvaluator` - -Key places: -- `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts` -- `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx` -- 
`web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts` - -#### 2) CRUD endpoint swap (configs) - -Replace: -- `GET/POST/PUT/DELETE /evaluators/configs/*` - -With: -- `POST /preview/simple/evaluators/query` -- `POST /preview/simple/evaluators/` -- `PUT /preview/simple/evaluators/{id}` -- `POST /preview/simple/evaluators/{id}/archive` - -Key files: -- `web/oss/src/services/evaluators/index.ts` -- `web/oss/src/state/evaluators/atoms.ts` - -Notes: -- `evaluator_key` is now derived from `simpleEvaluator.data.uri` (or carried separately in UI state) -- Settings are now `simpleEvaluator.data.parameters` +--- -#### 3) Run endpoint swap (native invoke) +## Execution Strategy -Replace: -- `POST /evaluators/{evaluator_key}/run` +To keep changes reviewable while avoiding adapters: -With: -- `POST /preview/workflows/invoke` +### PR 1: CRUD Migration +- Migrate all CRUD operations to `/preview/simple/evaluators/*` +- Change internal types from `EvaluatorConfig` to `SimpleEvaluator` +- Update atoms, services, and components +- Keep legacy run endpoint temporarily -What needs changing in the playground: -- `DebugSection.tsx` currently uses `createEvaluatorRunExecution(evaluatorKey, {inputs, settings})` -- New call should construct `WorkflowServiceRequest`: - - `interface.uri` (or `configuration`+`interface`) derived from evaluator `data` / built-in key - - `data.inputs` (merged testcase + prediction) - - `data.outputs` (prediction/output) - - `data.parameters` (settings) +### PR 2: Run Migration +- Migrate run from `/evaluators/{key}/run` to `/preview/workflows/invoke` +- Add `WorkflowServiceRequest/Response` types +- Update `DebugSection.tsx` to use native invoke -Key file: -- `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DebugSection.tsx` -- plus a new service client, e.g. `web/oss/src/services/workflows/invoke.ts` +This sequencing: +1. Isolates CRUD changes for easier review +2. 
Allows CRUD to stabilize before changing run +3. Avoids adapter layer entirely +4. Results in full migration with no legacy code -#### 4) Registry/list UI adjustments +--- -The evaluators registry table expects legacy `evaluator_key` and `settings_values`. Under Plan B: -- The list source becomes `SimpleEvaluator[]` -- Table columns need to read from `data.uri` and `data.parameters` +## Files Affected -Key files: -- `web/oss/src/components/Evaluators/index.tsx` -- `web/oss/src/components/Evaluators/assets/getColumns.tsx` -- `web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts` +### PR 1 (CRUD) -#### 5) Permissions and error handling +| Area | Files | +|------|-------| +| Types | `web/oss/src/lib/Types.ts` | +| Services | `web/oss/src/services/evaluators/index.ts` | +| State | `web/oss/src/state/evaluators/atoms.ts` | +| Playground State | `web/oss/src/components/.../ConfigureEvaluator/state/atoms.ts` | +| Playground UI | `web/oss/src/components/.../ConfigureEvaluator/index.tsx` | +| Registry | `web/oss/src/components/Evaluators/index.tsx` | +| Registry Hook | `web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts` | +| Columns | `web/oss/src/components/Evaluators/assets/getColumns.tsx` | -Native invoke uses `RUN_WORKFLOWS` permission (backend check). Expect: -- Different 403 behavior for some users -- Different error shape: workflow service returns `status.code/message` in response +### PR 2 (Run) -UI needs: -- Map workflow error status to `message.error` and output editor +| Area | Files | +|------|-------| +| Types | `web/oss/src/lib/Types.ts` (add workflow types) | +| Invoke Service | `web/oss/src/services/workflows/invoke.ts` (new) | +| Debug Section | `web/oss/src/components/.../ConfigureEvaluator/DebugSection.tsx` | --- -## Practical Recommendation - -If the objective is “duplicate all endpoints and fully migrate”, Plan B is the right destination. 
- -To reduce risk while still avoiding adapters, a pragmatic sequencing is: +## Key Mapping Changes -1) Migrate CRUD to SimpleEvaluator endpoints (Plan B) -2) Keep legacy run for 1-2 PRs while CRUD stabilizes -3) Migrate run to `/preview/workflows/invoke` (Plan B completion) +| Legacy | New | +|--------|-----| +| `EvaluatorConfig` | `SimpleEvaluator` | +| `evaluator_key` | derived from `data.uri` | +| `settings_values` | `data.parameters` | +| `GET /evaluators/configs/` | `POST /preview/simple/evaluators/query` | +| `POST /evaluators/configs/` | `POST /preview/simple/evaluators/` | +| `PUT /evaluators/configs/{id}/` | `PUT /preview/simple/evaluators/{id}` | +| `DELETE /evaluators/configs/{id}/` | `POST /preview/simple/evaluators/{id}/archive` | +| `POST /evaluators/{key}/run/` | `POST /preview/workflows/invoke` | -This keeps changes reviewable without introducing a permanent adapter layer. +See [plan.md](./plan.md) for detailed implementation steps. diff --git a/docs/design/migrate-evaluator-playground/plan.md b/docs/design/migrate-evaluator-playground/plan.md index 35a1f9eb4a..a234ec2111 100644 --- a/docs/design/migrate-evaluator-playground/plan.md +++ b/docs/design/migrate-evaluator-playground/plan.md @@ -2,63 +2,67 @@ ## Overview -This plan outlines an incremental migration approach that minimizes risk and allows for gradual rollout. The key principle is **transform at boundaries** - keep internal data shapes stable and only change API interactions. +Full migration of the Evaluator Playground to the new workflow-based evaluator APIs. This plan follows **Option B (Direct Migration)** - no adapters, internal shapes change to match the new `SimpleEvaluator` model. ## Migration Strategy -Two viable strategies exist: +**Two PRs, no adapters:** -- Plan A (transitional): adapter pattern, keep internal legacy `EvaluatorConfig` shape -- Plan B (preferred destination): direct migration, internal shapes become `SimpleEvaluator` + native invoke +1. 
**PR 1:** Migrate CRUD to `SimpleEvaluator` endpoints (internal shapes change) +2. **PR 2:** Migrate run to native workflow invoke (`/preview/workflows/invoke`) -This file documents Plan A as the low-risk execution plan. For the direct plan, see `docs/design/migrate-evaluator-playground/migration-options.md`. - -## Plan A: Adapter Pattern - -Instead of changing data shapes throughout the codebase, we'll: -1. Create adapter functions at the API boundary -2. New endpoints return `SimpleEvaluator`, adapters convert to internal `EvaluatorConfig` shape -3. Internal components continue working unchanged -4. Gradually update internals later (optional) +This keeps changes reviewable while avoiding tech debt from adapter layers. ``` -┌──────────────┐ ┌──────────────┐ ┌──────────────────┐ -│ New API │ ──► │ Adapter │ ──► │ Internal Shape │ -│ Endpoints │ │ Layer │ │ (unchanged) │ -└──────────────┘ └──────────────┘ └──────────────────┘ +PR 1: CRUD Migration +┌─────────────────────────────────────────────────────────────────┐ +│ EvaluatorConfig → SimpleEvaluator │ +│ /evaluators/configs/* → /preview/simple/evaluators/* │ +│ settings_values → data.parameters │ +│ evaluator_key → data.uri │ +└─────────────────────────────────────────────────────────────────┘ + +PR 2: Run Migration +┌─────────────────────────────────────────────────────────────────┐ +│ /evaluators/{key}/run → /preview/workflows/invoke │ +│ EvaluatorInputInterface → WorkflowServiceRequest │ +└─────────────────────────────────────────────────────────────────┘ ``` --- -## Phase 1: Foundation (Low Risk) - -**Goal:** Create adapter layer and new service functions without changing existing code +## PR 1: CRUD Migration -### Tasks +**Goal:** Replace legacy evaluator config endpoints with new SimpleEvaluator endpoints. Change internal data model from `EvaluatorConfig` to `SimpleEvaluator`. 
-#### 1.1 Create Type Definitions +### Phase 1.1: Type Definitions -**File:** `web/oss/src/lib/Types.ts` or new file `web/oss/src/services/evaluators/types.ts` +**File:** `web/oss/src/lib/Types.ts` (add to existing types) ```typescript -// New API types -interface SimpleEvaluatorData { +// ============ SimpleEvaluator Types ============ + +export interface SimpleEvaluatorData { version?: string - uri?: string - url?: string + uri?: string // e.g., "agenta:builtin:auto_exact_match:v0" + url?: string // for webhook evaluators headers?: Record - schemas?: { outputs?: Record } + schemas?: { + outputs?: Record + inputs?: Record + parameters?: Record + } script?: { content: string; runtime: string } - parameters?: Record + parameters?: Record // replaces settings_values } -interface SimpleEvaluatorFlags { +export interface SimpleEvaluatorFlags { is_custom?: boolean is_evaluator?: boolean is_human?: boolean } -interface SimpleEvaluator { +export interface SimpleEvaluator { id: string slug: string name?: string @@ -71,32 +75,54 @@ interface SimpleEvaluator { updated_at: string } -interface SimpleEvaluatorResponse { +export interface SimpleEvaluatorCreate { + slug: string + name?: string + description?: string + tags?: string[] + flags?: SimpleEvaluatorFlags + data?: SimpleEvaluatorData +} + +export interface SimpleEvaluatorEdit { + id: string + name?: string + description?: string + tags?: string[] + data?: SimpleEvaluatorData +} + +export interface SimpleEvaluatorResponse { count: number evaluator: SimpleEvaluator | null } -interface SimpleEvaluatorsResponse { +export interface SimpleEvaluatorsResponse { count: number evaluators: SimpleEvaluator[] } ``` -#### 1.2 Create Adapter Functions +**Deliverables:** +- [ ] Add `SimpleEvaluator*` types to Types.ts +- [ ] Keep `EvaluatorConfig` temporarily for areas not yet migrated + +--- + +### Phase 1.2: Service Layer Changes + +**File:** `web/oss/src/services/evaluators/index.ts` -**File:** 
`web/oss/src/services/evaluators/adapters.ts` +Replace legacy functions with new implementations: ```typescript -import { EvaluatorConfig } from "@/oss/lib/Types" -import { SimpleEvaluator, SimpleEvaluatorData } from "./types" -import { getTagColors } from "@/oss/lib/helpers/colors" -import { stringToNumberInRange } from "@/oss/lib/helpers/utils" +// ============ Helper Functions ============ /** * Extract evaluator_key from URI * URI format: "agenta:builtin:{key}:v0" */ -export function extractEvaluatorKey(uri: string | undefined): string { +export function extractEvaluatorKeyFromUri(uri: string | undefined): string { if (!uri) return "" const parts = uri.split(":") if (parts.length >= 3 && parts[0] === "agenta" && parts[1] === "builtin") { @@ -112,69 +138,23 @@ export function buildEvaluatorUri(evaluatorKey: string): string { return `agenta:builtin:${evaluatorKey}:v0` } -/** - * Convert SimpleEvaluator to internal EvaluatorConfig shape - */ -export function simpleEvaluatorToConfig( - simple: SimpleEvaluator, - projectId?: string -): EvaluatorConfig { - const tagColors = getTagColors() - const evaluatorKey = extractEvaluatorKey(simple.data?.uri) - - return { - id: simple.id, - name: simple.name || "", - evaluator_key: evaluatorKey, - settings_values: simple.data?.parameters || {}, - created_at: simple.created_at, - updated_at: simple.updated_at, - // Frontend additions - color: tagColors[stringToNumberInRange(evaluatorKey, 0, tagColors.length - 1)], - tags: simple.tags, - } -} - -/** - * Convert internal EvaluatorConfig to SimpleEvaluator create payload - */ -export function configToSimpleEvaluatorCreate( - config: Omit, - outputsSchema?: Record -): SimpleEvaluatorCreate { - return { - slug: generateSlug(config.name), - name: config.name, - flags: { is_evaluator: true }, - data: { - uri: buildEvaluatorUri(config.evaluator_key), - parameters: config.settings_values, - schemas: outputsSchema ? 
{ outputs: outputsSchema } : undefined, - }, - } -} - /** * Generate slug from name */ -function generateSlug(name: string): string { +export function generateSlug(name: string): string { return name .toLowerCase() .replace(/[^a-z0-9]+/g, "-") .replace(/^-|-$/g, "") + .substring(0, 50) // limit length } -``` - -#### 1.3 Create New Service Functions -**File:** `web/oss/src/services/evaluators/index.ts` (add to existing) +// ============ CRUD Functions ============ -```typescript -// === NEW ENDPOINT FUNCTIONS === - -export const fetchAllEvaluatorConfigsV2 = async ( +export const fetchAllEvaluatorConfigs = async ( + _appId?: string | null, // kept for backward compat, ignored projectIdOverride?: string | null, -): Promise => { +): Promise => { const {projectId: projectIdFromStore} = getProjectValues() const projectId = projectIdOverride ?? projectIdFromStore @@ -182,269 +162,561 @@ export const fetchAllEvaluatorConfigsV2 = async ( const response = await axios.post( `${getAgentaApiUrl()}/preview/simple/evaluators/query?project_id=${projectId}`, - { flags: { is_evaluator: true } } + { evaluator: { flags: { is_evaluator: true } } } ) - const evaluators = response.data?.evaluators || [] - return evaluators.map((e: SimpleEvaluator) => simpleEvaluatorToConfig(e, projectId)) + return response.data?.evaluators || [] } -export const createEvaluatorConfigV2 = async ( - config: CreateEvaluationConfigData, -): Promise => { +export const createEvaluatorConfig = async ( + evaluatorKey: string, + name: string, + settingsValues: Record, +): Promise => { const {projectId} = getProjectValues() - const payload = configToSimpleEvaluatorCreate(config) + const payload: SimpleEvaluatorCreate = { + slug: generateSlug(name), + name, + flags: { is_evaluator: true }, + data: { + uri: buildEvaluatorUri(evaluatorKey), + parameters: settingsValues, + }, + } const response = await axios.post( `${getAgentaApiUrl()}/preview/simple/evaluators/?project_id=${projectId}`, - payload, + { evaluator: 
payload }, ) - const simple = response.data?.evaluator - if (!simple) throw new Error("Failed to create evaluator") + const result = response.data?.evaluator + if (!result) throw new Error("Failed to create evaluator") - return simpleEvaluatorToConfig(simple, projectId) + return result } -export const updateEvaluatorConfigV2 = async ( - configId: string, - config: Partial, -): Promise => { +export const updateEvaluatorConfig = async ( + evaluatorId: string, + updates: { name?: string; settingsValues?: Record }, +): Promise => { const {projectId} = getProjectValues() const payload: SimpleEvaluatorEdit = { - id: configId, - name: config.name, - data: config.settings_values - ? { parameters: config.settings_values } + id: evaluatorId, + name: updates.name, + data: updates.settingsValues + ? { parameters: updates.settingsValues } : undefined, } const response = await axios.put( - `${getAgentaApiUrl()}/preview/simple/evaluators/${configId}?project_id=${projectId}`, - payload, + `${getAgentaApiUrl()}/preview/simple/evaluators/${evaluatorId}?project_id=${projectId}`, + { evaluator: payload }, ) - const simple = response.data?.evaluator - if (!simple) throw new Error("Failed to update evaluator") + const result = response.data?.evaluator + if (!result) throw new Error("Failed to update evaluator") - return simpleEvaluatorToConfig(simple, projectId) + return result } -export const deleteEvaluatorConfigV2 = async (configId: string): Promise => { +export const deleteEvaluatorConfig = async (evaluatorId: string): Promise => { const {projectId} = getProjectValues() await axios.post( - `${getAgentaApiUrl()}/preview/simple/evaluators/${configId}/archive?project_id=${projectId}`, + `${getAgentaApiUrl()}/preview/simple/evaluators/${evaluatorId}/archive?project_id=${projectId}`, ) return true } + +export const fetchEvaluatorById = async (evaluatorId: string): Promise => { + const {projectId} = getProjectValues() + + const response = await axios.get( + 
`${getAgentaApiUrl()}/preview/simple/evaluators/${evaluatorId}?project_id=${projectId}`, + ) + + return response.data?.evaluator || null +} ``` **Deliverables:** -- [ ] Type definitions for new API shapes -- [ ] Adapter functions (both directions) -- [ ] New service functions with V2 suffix -- [ ] Unit tests for adapters - -**Estimated Effort:** 1-2 days +- [ ] Replace `fetchAllEvaluatorConfigs` implementation +- [ ] Replace `createEvaluatorConfig` implementation +- [ ] Replace `updateEvaluatorConfig` implementation +- [ ] Replace `deleteEvaluatorConfig` implementation +- [ ] Add helper functions for URI handling +- [ ] Remove legacy endpoint calls --- -## Phase 2: Feature Flag Integration (Low Risk) - -**Goal:** Add feature flag to toggle between old and new endpoints +### Phase 1.3: State/Atoms Changes -### Tasks +**File:** `web/oss/src/state/evaluators/atoms.ts` -#### 2.1 Add Feature Flag - -**File:** `web/oss/src/lib/helpers/featureFlags.ts` or environment config +Update query atoms to return `SimpleEvaluator[]`: ```typescript -export const USE_NEW_EVALUATOR_ENDPOINTS = - process.env.NEXT_PUBLIC_USE_NEW_EVALUATOR_ENDPOINTS === "true" +export const evaluatorConfigsQueryAtomFamily = atomFamily((projectId: string | null) => + atomWithQuery(() => ({ + queryKey: ["evaluator-configs", projectId], + queryFn: () => fetchAllEvaluatorConfigs(null, projectId), + enabled: !!projectId, + })) +) + +// Derived atom for non-archived evaluators +export const nonArchivedEvaluatorsAtom = atom((get) => { + const projectId = get(projectIdAtom) + if (!projectId) return [] + + const query = get(evaluatorConfigsQueryAtomFamily(projectId)) + const evaluators = query.data ?? 
[] + + // Filter out archived (deleted_at is set) + return evaluators.filter((e) => !e.deleted_at) +}) ``` -#### 2.2 Create Unified Service Functions +**File:** `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts` -**File:** `web/oss/src/services/evaluators/index.ts` +Update playground atoms to use `SimpleEvaluator`: ```typescript -// Unified functions that use feature flag -export const fetchAllEvaluatorConfigs = async (...args) => { - if (USE_NEW_EVALUATOR_ENDPOINTS) { - return fetchAllEvaluatorConfigsV2(...args) - } - return fetchAllEvaluatorConfigsLegacy(...args) +// Session now stores SimpleEvaluator instead of EvaluatorConfig +export interface PlaygroundSession { + evaluator: Evaluator // template (unchanged) + simpleEvaluator?: SimpleEvaluator // existing config being edited + mode: "create" | "edit" | "clone" } -export const createEvaluatorConfig = async (...args) => { - if (USE_NEW_EVALUATOR_ENDPOINTS) { - return createEvaluatorConfigV2(...args) - } - return createEvaluatorConfigLegacy(...args) -} +export const playgroundSessionAtom = atom(null) -// ... 
same for update and delete +// Edit values now use SimpleEvaluator shape +export const playgroundEditValuesAtom = atom<Partial<SimpleEvaluator> | null>(null) + +// Derived: get evaluator_key from URI +export const playgroundEvaluatorKeyAtom = atom((get) => { + const session = get(playgroundSessionAtom) + if (!session) return null + + // From template + if (session.evaluator?.key) return session.evaluator.key + + // From existing SimpleEvaluator + if (session.simpleEvaluator?.data?.uri) { + return extractEvaluatorKeyFromUri(session.simpleEvaluator.data.uri) + } + + return null +}) ``` **Deliverables:** -- [ ] Feature flag configuration -- [ ] Unified service functions with flag branching -- [ ] Documentation for enabling flag - -**Estimated Effort:** 0.5 days +- [ ] Update `evaluatorConfigsQueryAtomFamily` return type +- [ ] Update playground session atoms +- [ ] Update `playgroundEditValuesAtom` shape +- [ ] Add derived atoms for backward-compatible access (e.g., `evaluator_key`) --- -## Phase 3: Integration Testing (Medium Risk) +### Phase 1.4: Component Changes -**Goal:** Verify new endpoints work correctly with existing UI +#### ConfigureEvaluator/index.tsx -### Tasks +Key changes: +- Form fields read/write to `data.parameters` instead of `settings_values` +- On commit, build `SimpleEvaluatorCreate` or `SimpleEvaluatorEdit` +- Load existing config as `SimpleEvaluator` -#### 3.1 Enable Feature Flag in Development +```typescript +// Before +form.setFieldsValue({ + name: editEvalEditValues.name, + settings_values: editEvalEditValues.settings_values, +}) + +// After +form.setFieldsValue({ + name: simpleEvaluator.name, + parameters: simpleEvaluator.data?.parameters, +}) +``` -- Set `NEXT_PUBLIC_USE_NEW_EVALUATOR_ENDPOINTS=true` in dev environment -- Test all evaluator playground flows +#### useEvaluatorsRegistryData.ts -#### 3.2 Test Cases +Update to work with `SimpleEvaluator[]`: + +```typescript +// Derive evaluator_key for display +const enrichedEvaluators = evaluators.map((e) => ({ + 
...e, + evaluator_key: extractEvaluatorKeyFromUri(e.data?.uri), + settings_values: e.data?.parameters, // for backward compat in UI +})) +``` + +#### getColumns.tsx + +Update column accessors: + +```typescript +// Before +dataIndex: "evaluator_key" + +// After +dataIndex: ["data", "uri"], +render: (uri) => extractEvaluatorKeyFromUri(uri) +``` + +**Deliverables:** +- [ ] Update ConfigureEvaluator form bindings +- [ ] Update commit logic to use new service functions +- [ ] Update useEvaluatorsRegistryData hook +- [ ] Update table columns in getColumns.tsx +- [ ] Update any other components that read evaluator configs + +--- + +### Phase 1.5: Testing + +**Test Cases:** 1. **List Evaluators** - [ ] Registry shows all existing evaluator configs - - [ ] Correct names, types, and icons displayed + - [ ] Correct names, types, icons displayed - [ ] Filtering and search work + - [ ] Archived evaluators hidden 2. **Create Evaluator** - - [ ] Select template → Configure → Commit - - [ ] Settings saved correctly - - [ ] Redirects to edit page after create + - [ ] Select template → Configure → Commit works + - [ ] Settings (parameters) saved correctly + - [ ] URI generated correctly from evaluator_key + - [ ] Slug generated from name 3. **Edit Evaluator** - - [ ] Load existing config - - [ ] Form populated with current values - - [ ] Update settings + - [ ] Load existing config into form + - [ ] Form populated with current values from `data.parameters` + - [ ] Update name and settings - [ ] Changes persisted 4. **Delete Evaluator** - - [ ] Delete confirmation works + - [ ] Archive endpoint called - [ ] Evaluator removed from list - [ ] No errors -5. **Test Evaluator** - - [ ] Load testcase - - [ ] Run variant - - [ ] Run evaluator +5. 
**Run Evaluator (legacy endpoint - still works)** + - [ ] Run evaluator button works + - [ ] Uses evaluator_key derived from URI - [ ] Results displayed correctly **Deliverables:** -- [ ] Test results document -- [ ] Bug fixes for any issues found -- [ ] Performance comparison (if applicable) +- [ ] Manual test all flows +- [ ] Fix any bugs found +- [ ] Document any edge cases -**Estimated Effort:** 2-3 days +--- + +### PR 1 Summary + +| Task | Files | Effort | +|------|-------|--------| +| Type definitions | `Types.ts` | 0.5 day | +| Service layer | `services/evaluators/index.ts` | 1 day | +| State/atoms | `state/evaluators/atoms.ts`, playground atoms | 1 day | +| Components | ConfigureEvaluator, Registry, columns | 1-2 days | +| Testing | Manual testing | 1 day | + +**Total PR 1 Effort:** 4-5 days --- -## Phase 4: Gradual Rollout (Low Risk) +## PR 2: Run Migration -**Goal:** Enable new endpoints for subset of users +**Goal:** Replace legacy `/evaluators/{key}/run` with native workflow invoke `/preview/workflows/invoke`. -### Tasks +**Prerequisite:** PR 1 merged and stable. -#### 4.1 Staged Rollout +### Phase 2.1: WorkflowService Types -1. **Internal testing:** Enable for team members only -2. **Beta users:** Enable for opt-in users -3. 
**General availability:** Enable for all users +**File:** `web/oss/src/lib/Types.ts` (add) + +```typescript +// ============ Workflow Service Types ============ + +export interface WorkflowServiceRequestData { + revision?: Record<string, any> + parameters?: Record<string, any> // evaluator settings + testcase?: Record<string, any> + inputs?: Record<string, any> // merged testcase data + trace?: Record<string, any> + outputs?: any // prediction/output +} -#### 4.2 Monitoring +export interface WorkflowServiceInterface { + version?: string + uri?: string // e.g., "agenta:builtin:auto_exact_match:v0" + url?: string + headers?: Record<string, any> + schemas?: Record<string, any> +} -- Monitor error rates for evaluator operations -- Track API response times -- Watch for unexpected 404/500 errors +export interface WorkflowServiceConfiguration { + script?: Record<string, any> + parameters?: Record<string, any> +} -**Deliverables:** -- [ ] Rollout schedule -- [ ] Rollback procedure documented -- [ ] Monitoring dashboards/alerts +export interface WorkflowServiceRequest { + version?: string + flags?: Record<string, any> + interface?: WorkflowServiceInterface + configuration?: WorkflowServiceConfiguration + data?: WorkflowServiceRequestData + references?: Record<string, any> + links?: Record<string, any> +} -**Estimated Effort:** 1-2 weeks (elapsed time) +export interface WorkflowServiceStatus { + code?: number + message?: string + type?: string + stacktrace?: string | string[] +} + +export interface WorkflowServiceResponseData { + outputs?: any +} + +export interface WorkflowServiceBatchResponse { + version?: string + trace_id?: string + span_id?: string + status?: WorkflowServiceStatus + data?: WorkflowServiceResponseData +} +``` --- -## Phase 5: Cleanup (Low Risk) +### Phase 2.2: Workflow Invoke Service -**Goal:** Remove legacy code and feature flag +**File:** `web/oss/src/services/workflows/invoke.ts` (new file) -### Tasks +```typescript +import axios from "@/oss/lib/api/assets/axiosConfig" +import { getAgentaApiUrl } from "@/oss/lib/helpers/utils" +import { getProjectValues } from "@/oss/contexts/project.context" +import { + 
WorkflowServiceRequest, + WorkflowServiceBatchResponse, + SimpleEvaluator, +} from "@/oss/lib/Types" + +export interface InvokeEvaluatorParams { + evaluator: SimpleEvaluator + inputs: Record<string, any> // testcase data + any extra inputs + outputs: any // prediction/output from variant + parameters?: Record<string, any> // override settings (optional) +} -#### 5.1 Remove Legacy Functions +/** + * Invoke an evaluator using native workflow service + */ +export const invokeEvaluator = async ( + params: InvokeEvaluatorParams +): Promise<WorkflowServiceBatchResponse> => { + const { projectId } = getProjectValues() + const { evaluator, inputs, outputs, parameters } = params + + const uri = evaluator.data?.uri + if (!uri) { + throw new Error("Evaluator has no URI configured") + } -- Remove `fetchAllEvaluatorConfigsLegacy` -- Remove `createEvaluatorConfigLegacy` -- Remove `updateEvaluatorConfigLegacy` -- Remove `deleteEvaluatorConfigLegacy` + const request: WorkflowServiceRequest = { + version: "2025.07.14", + interface: { + uri, + }, + configuration: { + parameters: parameters ?? evaluator.data?.parameters, + }, + data: { + inputs, + outputs, + parameters: parameters ?? evaluator.data?.parameters, + }, + } -#### 5.2 Remove Feature Flag + const response = await axios.post( + `${getAgentaApiUrl()}/preview/workflows/invoke?project_id=${projectId}`, + request, + ) -- Remove feature flag checks -- Clean up V2 suffix from function names + return response.data +} -#### 5.3 Update Documentation +/** + * Map workflow response to evaluator output format + */ +export function mapWorkflowResponseToEvaluatorOutput( + response: WorkflowServiceBatchResponse +): { outputs: Record<string, any> } { + if (response.status?.code && response.status.code >= 400) { + throw new Error(response.status.message || "Evaluator execution failed") + } -- Update API documentation -- Update developer docs + return { + outputs: response.data?.outputs ?? 
{}, + } +} +``` + +--- + +### Phase 2.3: Update DebugSection + +**File:** `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DebugSection.tsx` + +Replace `createEvaluatorRunExecution` with `invokeEvaluator`: + +```typescript +// Before +const runResponse = await createEvaluatorRunExecution( + selectedEvaluator.key, + { + inputs: outputs, + settings: formValues.settings_values, + } +) + +// After +import { invokeEvaluator, mapWorkflowResponseToEvaluatorOutput } from "@/oss/services/workflows/invoke" + +const workflowResponse = await invokeEvaluator({ + evaluator: simpleEvaluator, // from playground state + inputs: { + ...testcaseData, + prediction: variantOutput, + }, + outputs: variantOutput, + parameters: formValues.parameters, // current form settings +}) + +const runResponse = mapWorkflowResponseToEvaluatorOutput(workflowResponse) +``` + +**Error Handling:** + +```typescript +try { + const workflowResponse = await invokeEvaluator(...) + + // Check for workflow-level errors + if (workflowResponse.status?.code && workflowResponse.status.code >= 400) { + message.error(workflowResponse.status.message || "Evaluator failed") + return + } + + const result = mapWorkflowResponseToEvaluatorOutput(workflowResponse) + setEvaluatorResult(result.outputs) + +} catch (error) { + message.error(getErrorMessage(error)) +} +``` + +--- + +### Phase 2.4: Update Evaluations Service (if needed) + +If other parts of the app use `createEvaluatorRunExecution`, update them too: + +**File:** `web/oss/src/services/evaluations/api_ee/index.ts` + +- Keep `createEvaluatorRunExecution` for now (batch evaluations may still use it via backend) +- Or deprecate and point to new invoke + +--- + +### Phase 2.5: Testing + +**Test Cases:** + +1. **Run Evaluator in Playground** + - [ ] Click "Run Evaluator" with testcase loaded + - [ ] Native invoke endpoint called + - [ ] Results displayed correctly + - [ ] Errors handled gracefully + +2. 
**Different Evaluator Types** + - [ ] Test exact_match evaluator + - [ ] Test regex evaluator + - [ ] Test AI critique evaluator (LLM-based) + - [ ] Test custom code evaluator + +3. **Error Scenarios** + - [ ] Invalid evaluator (no URI) + - [ ] Missing inputs + - [ ] Evaluator execution error + - [ ] Network error + +4. **Permissions** + - [ ] User with RUN_WORKFLOWS permission can run + - [ ] User without permission gets appropriate error **Deliverables:** -- [ ] Legacy code removed -- [ ] Feature flag removed -- [ ] Documentation updated -- [ ] PR for cleanup +- [ ] Manual test all evaluator types +- [ ] Fix any bugs found +- [ ] Verify error messages are user-friendly -**Estimated Effort:** 1 day +--- + +### PR 2 Summary + +| Task | Files | Effort | +|------|-------|--------| +| Workflow types | `Types.ts` | 0.5 day | +| Invoke service | `services/workflows/invoke.ts` | 0.5 day | +| DebugSection update | `DebugSection.tsx` | 1 day | +| Error handling | Various | 0.5 day | +| Testing | Manual testing | 1 day | + +**Total PR 2 Effort:** 3-4 days --- ## Timeline Summary -| Phase | Duration | Risk | Dependencies | -|-------|----------|------|--------------| -| Phase 1: Foundation | 1-2 days | Low | None | -| Phase 2: Feature Flag | 0.5 days | Low | Phase 1 | -| Phase 3: Integration Testing | 2-3 days | Medium | Phase 2, Backend PR merged | -| Phase 4: Gradual Rollout | 1-2 weeks | Low | Phase 3 | -| Phase 5: Cleanup | 1 day | Low | Phase 4 complete | +| PR | Tasks | Effort | Dependencies | +|----|-------|--------|--------------| +| PR 1: CRUD Migration | Types, services, atoms, components | 4-5 days | Backend PR #3527 merged | +| PR 2: Run Migration | Workflow types, invoke service, DebugSection | 3-4 days | PR 1 merged and stable | -**Total Implementation Time:** ~5-7 days -**Total Rollout Time:** ~2-3 weeks +**Total Implementation:** 7-9 days --- ## Rollback Plan -If issues are discovered after deployment: +### PR 1 Rollback +- Revert PR 1 commit +- Legacy 
endpoints still exist on backend for a period -1. **Immediate:** Set feature flag to `false` -2. **Short-term:** Deploy hotfix to disable new endpoints -3. **Investigation:** Analyze issues with new endpoints -4. **Resolution:** Fix and re-test before re-enabling +### PR 2 Rollback +- Revert PR 2 commit +- Fall back to legacy `/evaluators/{key}/run` (still supported) --- ## Open Questions -1. **Output Schema Generation:** Should the frontend generate output schemas when creating evaluators, or should the backend handle this? - - Current PR shows backend generates schemas during migration - - Frontend may need to include schema for new configs +1. **Slug uniqueness:** Does backend enforce unique slugs? If collision, does it auto-suffix? + +2. **Output schemas:** Should frontend pass `data.schemas.outputs` when creating? Or does backend derive from evaluator type? -2. **Slug Generation:** Should slugs be generated client-side or server-side? - - Server-side is safer (uniqueness checks) - - Client-side is faster (no round-trip) +3. **Permission model:** Is `RUN_WORKFLOWS` the right permission for evaluator playground? Or should there be `RUN_EVALUATORS`? -3. **Error Handling:** How should the frontend handle validation errors from new endpoints? - - New endpoints may return different error shapes - - Need to map to user-friendly messages +4. **Trace linking:** Should the playground display trace_id from workflow response for debugging? 
diff --git a/docs/design/migrate-evaluator-playground/risk-analysis.md b/docs/design/migrate-evaluator-playground/risk-analysis.md index 0bd037f0a0..3c522d441a 100644 --- a/docs/design/migrate-evaluator-playground/risk-analysis.md +++ b/docs/design/migrate-evaluator-playground/risk-analysis.md @@ -25,9 +25,10 @@ interface EvaluatorConfig { - `playgroundEditValuesAtom` is read throughout ConfigureEvaluator and DebugSection - Form initialization relies on `settings_values` property name -**Mitigation:** -- Create adapter functions to convert between `SimpleEvaluator` and internal state -- Or update atoms to use `SimpleEvaluator` shape and update all consumers +**Mitigation (PR 1):** +- Update atoms to use `SimpleEvaluator` shape directly +- Add derived atoms for backward-compatible access (e.g., `evaluator_key` from URI) +- Update all atom consumers in ConfigureEvaluator and DebugSection --- @@ -53,9 +54,10 @@ if (editMode && editEvalEditValues) { - Changing to `data.parameters` would break form binding - DynamicFormField components use `["settings_values", field.key]` name paths -**Mitigation:** -- Keep internal form structure as `settings_values` -- Transform on API boundary (adapter pattern) +**Mitigation (PR 1):** +- Update form field names from `settings_values` to `parameters` +- Update DynamicFormField name paths +- Update form.getFieldsValue() to extract `parameters` --- @@ -81,10 +83,10 @@ return axios.put(`/evaluators/configs/${configId}?project_id=${projectId}`, conf - Need to update URLs and payload transformation - Response handling needs to unwrap `{ evaluator: ... 
}` wrapper -**Mitigation:** -- Create new service functions for new endpoints -- Keep old functions temporarily for gradual migration -- Add response/request transformers +**Mitigation (PR 1):** +- Replace all service functions with new implementations +- New functions build `SimpleEvaluator` payloads directly +- Handle response wrapper in service layer --- @@ -106,9 +108,10 @@ const {evaluatorConfigs} = useFetchEvaluatorsData() - Tag cells, type pills depend on config shape - Filtering/search operates on legacy property names -**Mitigation:** -- Update hook to handle new `SimpleEvaluator` shape -- Transform data at fetch boundary, keep internal shape consistent +**Mitigation (PR 1):** +- Update hook to work with `SimpleEvaluator[]` +- Derive `evaluator_key` from `data.uri` for display +- Update column accessors in getColumns.tsx --- @@ -116,12 +119,11 @@ const {evaluatorConfigs} = useFetchEvaluatorsData() **Location:** `web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DebugSection.tsx` -**Risk Level:** LOW +**Risk Level:** MEDIUM (PR 2) -The evaluator run uses `evaluator_key` directly: +The evaluator run uses legacy endpoint: ```typescript -// Line 456 const runResponse = await createEvaluatorRunExecution( selectedEvaluator.key, // evaluator_key { inputs: outputs, settings: ... 
} @@ -129,13 +131,14 @@ const runResponse = await createEvaluatorRunExecution( ``` **Impact:** -- This endpoint (`/evaluators/{key}/run/`) remains unchanged -- Uses `selectedEvaluator.key` from template, not config -- No direct coupling to `EvaluatorConfig` shape +- Must migrate to `/preview/workflows/invoke` +- Need to construct `WorkflowServiceRequest` +- Different error handling (workflow status vs HTTP errors) -**Mitigation:** -- No changes needed for run functionality -- Keep using evaluator templates for the `key` value +**Mitigation (PR 2):** +- Create new `invokeEvaluator()` service function +- Build `WorkflowServiceRequest` with URI from `SimpleEvaluator.data.uri` +- Map workflow response/errors to UI --- @@ -160,10 +163,10 @@ const evaluatorConfigsQueryAtomFamily = atomFamily((projectId) => - Multiple components may depend on these atoms - Changing shape could cascade through application -**Mitigation:** -- Update query functions to use new endpoints -- Transform response at query boundary to maintain internal shape -- Or update all consumers to handle new shape +**Mitigation (PR 1):** +- Update service function to return `SimpleEvaluator[]` +- Update all consumers to handle new shape +- Change in one place (service), ripple through atoms automatically --- @@ -175,7 +178,7 @@ const evaluatorConfigsQueryAtomFamily = atomFamily((projectId) => The frontend distinguishes between: - **Evaluator templates** (`Evaluator`): Built-in evaluator definitions with `settings_template` -- **Evaluator configs** (`EvaluatorConfig`): User-created configurations with `settings_values` +- **Evaluator configs** (`SimpleEvaluator`): User-created configurations with `data.parameters` **Impact:** - This distinction is maintained in the new system @@ -184,26 +187,27 @@ The frontend distinguishes between: **Mitigation:** - No conceptual change needed +- Templates API unchanged - Just update config handling --- ## Risk Summary Table -| Component | Risk Level | Complexity | Priority 
| -|-----------|-----------|------------|----------| -| Service Layer | LOW-MEDIUM | LOW | HIGH (change first) | -| State Atoms | MEDIUM | MEDIUM | HIGH | -| ConfigureEvaluator Form | MEDIUM | MEDIUM | MEDIUM | -| Evaluators Registry | MEDIUM | MEDIUM | MEDIUM | -| Debug Section | LOW | LOW | LOW | -| Global Query Atoms | MEDIUM | LOW | MEDIUM | +| Component | Risk Level | PR | Priority | +|-----------|-----------|-----|----------| +| Service Layer | LOW-MEDIUM | PR 1 | HIGH (change first) | +| State Atoms | MEDIUM | PR 1 | HIGH | +| ConfigureEvaluator Form | MEDIUM | PR 1 | MEDIUM | +| Evaluators Registry | MEDIUM | PR 1 | MEDIUM | +| Global Query Atoms | MEDIUM | PR 1 | MEDIUM | +| Debug Section (Run) | MEDIUM | PR 2 | MEDIUM | ## Concrete Breakage Scenarios ### Scenario 1: Form Submission Fails -**Trigger:** Change `settings_values` to `data.parameters` without updating form +**Trigger:** Form still uses `settings_values` but service expects `parameters` **Symptoms:** - Form submits but settings are lost @@ -211,14 +215,15 @@ The frontend distinguishes between: - Evaluator created but doesn't work **Prevention:** -- Transform at API boundary, not in form +- Update form field names to `parameters` - Test form submission with real backend +- Verify payload in network tab --- ### Scenario 2: Evaluator List Empty -**Trigger:** Query endpoint returns new shape, UI expects old +**Trigger:** Query endpoint returns `SimpleEvaluator[]`, UI expects `EvaluatorConfig[]` **Symptoms:** - Evaluators registry shows empty list @@ -226,15 +231,15 @@ The frontend distinguishes between: - Console shows undefined property access **Prevention:** -- Update data transformation in hook -- Add null checks and fallbacks +- Update all components to use `SimpleEvaluator` shape +- Add null checks for `data?.uri`, `data?.parameters` - Log transformation errors --- ### Scenario 3: Edit Mode Fails to Load -**Trigger:** `playgroundEditValuesAtom` receives `SimpleEvaluator`, expects 
`EvaluatorConfig` +**Trigger:** Component expects `settings_values`, receives `data.parameters` **Symptoms:** - Navigate to edit page, form is empty @@ -242,7 +247,7 @@ The frontend distinguishes between: - Save overwrites with empty config **Prevention:** -- Transform at atom level +- Update form initialization to read from `data.parameters` - Test edit flow with existing configs --- @@ -262,26 +267,54 @@ The frontend distinguishes between: --- +### Scenario 5: Evaluator Run Fails (PR 2) + +**Trigger:** Workflow invoke returns different response shape + +**Symptoms:** +- Run button shows error +- Results not displayed +- Console shows parsing errors + +**Prevention:** +- Map `WorkflowServiceBatchResponse` to expected output format +- Handle `status.code` errors from workflow response +- Test with all evaluator types + +--- + ## Recommended Testing Strategy -### Unit Tests -- [ ] Service layer transformers (old shape ↔ new shape) +### PR 1 Testing + +**Unit Tests:** - [ ] URI parsing (`agenta:builtin:key:v0` → `key`) - [ ] Slug generation from name +- [ ] Service function request/response handling -### Integration Tests +**Integration Tests:** - [ ] Create evaluator config flow - [ ] Edit evaluator config flow - [ ] Delete (archive) evaluator config flow - [ ] List/query evaluator configs flow -### E2E Tests +**E2E Tests:** - [ ] Full playground flow: select template → configure → test → commit - [ ] Edit existing evaluator configuration - [ ] Clone evaluator configuration - [ ] Delete evaluator configuration -### Regression Tests -- [ ] Evaluator run still works -- [ ] Batch evaluations still work (use config IDs) -- [ ] Existing configs load correctly after migration +### PR 2 Testing + +**Unit Tests:** +- [ ] `WorkflowServiceRequest` construction +- [ ] Response mapping to evaluator output format +- [ ] Error status handling + +**Integration Tests:** +- [ ] Run evaluator with different types (exact_match, regex, AI critique) +- [ ] Error scenarios (invalid 
inputs, missing outputs) + +**Regression Tests:** +- [ ] Existing configs load correctly +- [ ] Batch evaluations still work (they use backend workflow invoke) diff --git a/docs/design/migrate-evaluator-playground/status.md b/docs/design/migrate-evaluator-playground/status.md index e0f32606eb..b566579b5d 100644 --- a/docs/design/migrate-evaluator-playground/status.md +++ b/docs/design/migrate-evaluator-playground/status.md @@ -1,11 +1,22 @@ # Status: Evaluator Playground Migration -## Current Phase: Research Complete +## Current Phase: Planning Complete **Last Updated:** 2026-01-27 --- +## Chosen Approach + +**Direct Migration (No Adapters)** - Split into two PRs: + +1. **PR 1:** CRUD migration to `SimpleEvaluator` endpoints +2. **PR 2:** Run migration to native workflow invoke + +See [plan.md](./plan.md) for detailed implementation steps. + +--- + ## Progress Summary ### Completed @@ -37,41 +48,45 @@ - Service layer coupling (LOW-MEDIUM risk) - Created risk mitigation strategies -- [x] Propose migration plan - - Adapter pattern approach - - Feature flag integration - - Phased rollout strategy +- [x] Finalize migration plan + - Chose direct migration (no adapters) + - Split into PR 1 (CRUD) and PR 2 (Run) + - Documented all file changes needed -### In Progress +### Next Steps -- [ ] Phase 1: Foundation - Not started +- [ ] Wait for PR #3527 to be merged +- [ ] Start PR 1: CRUD migration +- [ ] After PR 1 stable, start PR 2: Run migration -### Blocked +--- -- [ ] Phase 3: Integration Testing - Blocked on PR #3527 merge +## Key Decisions + +| Decision | Rationale | Date | +|----------|-----------|------| +| Direct migration (no adapters) | Avoids tech debt, aligns with new architecture | 2026-01-27 | +| Two-PR approach | Keeps changes reviewable, allows CRUD to stabilize first | 2026-01-27 | +| Internal shapes become `SimpleEvaluator` | Matches backend model, no translation layer | 2026-01-27 | --- ## Key Findings -### 1. 
The `/evaluators/{key}/run/` endpoint works but is now a wrapper +### 1. The `/evaluators/{key}/run/` endpoint is a thin wrapper -**Important Discovery:** PR #3527 refactored the legacy run endpoint to use the native handler registry internally: +PR #3527 refactored the legacy run endpoint to use the native handler registry internally: - It builds a URI from the evaluator_key: `agenta:builtin:{key}:v0` - Uses `retrieve_handler(uri)` to get the actual handler function - Directly invokes the handler -**Implication:** The external interface is unchanged, but internally it uses the new architecture. - ### 2. Native workflow invoke path exists There's a fully native way to run evaluators: - Endpoint: `POST /preview/workflows/invoke` -- Uses `WorkflowServiceRequest` with URI in revision data +- Uses `WorkflowServiceRequest` with URI in interface - Same mechanism used by batch evaluations -**Recommendation:** Keep using legacy endpoint for now (simpler), consider native invoke for future custom evaluator support. - ### 3. URI-based handler registry The SDK maintains a `HANDLER_REGISTRY` that maps URIs to handler functions: @@ -79,48 +94,34 @@ The SDK maintains a `HANDLER_REGISTRY` that maps URIs to handler functions: - Supports custom evaluators: `user:custom:my_eval:latest` - Enables version management of evaluator implementations -### 4. Adapter pattern minimizes risk - -By transforming data at the API boundary, we can: -- Keep internal data shapes unchanged -- Minimize code changes -- Enable easy rollback via feature flag - -### 5. Output schema handling +### 4. Key mapping changes -The new `SimpleEvaluator` model includes explicit output schemas. The backend migration generates these from evaluator settings. 
For new configs: -- Built-in evaluators: Schema can be derived from evaluator type -- Custom evaluators: Schema should be provided by user +| Legacy | New | +|--------|-----| +| `evaluator_key` | derived from `data.uri` | +| `settings_values` | `data.parameters` | +| `EvaluatorConfig` | `SimpleEvaluator` | --- -## Decisions Made - -| Decision | Rationale | Date | -|----------|-----------|------| -| Use adapter pattern | Minimizes changes to internal code, enables gradual migration | 2026-01-27 | -| Feature flag approach | Allows gradual rollout and easy rollback | 2026-01-27 | -| Keep form structure as `settings_values` | Avoid cascading changes to form components | 2026-01-27 | +## Open Questions ---- +1. **Slug uniqueness:** Does backend enforce unique slugs? If collision, does it auto-suffix? -## Open Questions +2. **Output schemas:** Should frontend pass `data.schemas.outputs` when creating? Or does backend derive from evaluator type? -1. **Run migration target:** For full migration, do we want the playground to invoke by: - - built-in key -> URI (`agenta:builtin:{key}:v0`), or - - evaluator revision URI stored on `SimpleEvaluator.data.uri` (preferred), or - - a specific evaluator revision id (even more explicit)? -2. **Output Schema:** Confirm whether frontend must provide `data.schemas.outputs` on create/edit, or backend will derive defaults. -3. **Slug Generation:** Client-side or server-side? +3. **Permission model:** Is `RUN_WORKFLOWS` the right permission for evaluator playground? Or should there be `RUN_EVALUATORS`? --- -## Next Steps +## Effort Estimates + +| PR | Effort | Dependencies | +|----|--------|--------------| +| PR 1: CRUD Migration | 4-5 days | Backend PR #3527 merged | +| PR 2: Run Migration | 3-4 days | PR 1 merged and stable | -1. Wait for PR #3527 to be merged -2. Start Phase 1: Create type definitions and adapters -3. Add feature flag infrastructure -4. 
Test with new endpoints +**Total:** 7-9 days implementation --- @@ -130,5 +131,7 @@ The new `SimpleEvaluator` model includes explicit output schemas. The backend mi - [context.md](./context.md) - Background and goals - [current-system.md](./current-system.md) - Current implementation details - [new-endpoints.md](./new-endpoints.md) - New endpoint documentation +- [research.md](./research.md) - Handler registry and execution research +- [migration-options.md](./migration-options.md) - Why we chose direct migration - [risk-analysis.md](./risk-analysis.md) - Coupling and risk analysis -- [plan.md](./plan.md) - Migration execution plan +- [plan.md](./plan.md) - Detailed implementation plan From e3e633d75ad7bec6f8f1fb9a3833b0ffa8d4dba3 Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Wed, 28 Jan 2026 12:51:51 +0100 Subject: [PATCH 3/4] feat(frontend): migrate evaluator configs CRUD --- .../new-endpoints.md | 8 +- .../migrate-evaluator-playground/plan.md | 38 ++++-- .../migrate-evaluator-playground/status.md | 7 +- .../src/components/Evaluators/assets/types.ts | 4 +- .../src/components/Evaluators/assets/utils.ts | 38 ++++-- .../components/ConfigureEvaluator/index.tsx | 3 +- .../hooks/useEvaluatorsRegistryData.ts | 4 +- .../Components/NewEvaluationModalInner.tsx | 3 +- .../SelectEvaluatorSection.tsx | 52 ++++---- .../pages/evaluations/NewEvaluation/types.ts | 6 +- .../ConfigureEvaluator/AdvancedSettings.tsx | 2 +- .../ConfigureEvaluator/DebugSection.tsx | 6 +- .../ConfigureEvaluator/DynamicFormField.tsx | 2 +- .../ConfigureEvaluator/FieldsTagsEditor.tsx | 2 +- .../ConfigureEvaluator/index.tsx | 119 ++++++++--------- .../ConfigureEvaluator/state/atoms.ts | 12 +- .../Evaluators/DeleteModal.tsx | 4 +- .../Evaluators/EvaluatorCard.tsx | 20 +-- .../Evaluators/EvaluatorList.tsx | 23 ++-- .../EvaluatorsModal/Evaluators/index.tsx | 18 ++- .../EvaluatorsModal/EvaluatorsModal.tsx | 4 +- .../OnlineEvaluationDrawer.tsx | 2 +- .../hooks/useEvaluatorDetails.ts | 36 +++++- 
.../hooks/useEvaluatorSelection.tsx | 31 +++-- .../hooks/useEvaluatorTypeFromConfigs.ts | 7 +- .../utils/evaluatorDetails.ts | 10 +- web/oss/src/lib/Types.ts | 70 ++++++++++ web/oss/src/lib/atoms/evaluation.ts | 4 +- web/oss/src/lib/evaluators/utils.ts | 80 ++++++++++++ .../lib/hooks/useEvaluatorConfigs/index.ts | 4 +- web/oss/src/services/evaluations/api/index.ts | 2 +- web/oss/src/services/evaluators/index.ts | 120 ++++++++++++++---- web/oss/src/state/evaluators/atoms.ts | 24 ++-- 33 files changed, 530 insertions(+), 235 deletions(-) create mode 100644 web/oss/src/lib/evaluators/utils.ts diff --git a/docs/design/migrate-evaluator-playground/new-endpoints.md b/docs/design/migrate-evaluator-playground/new-endpoints.md index 05231c4813..97a20f01b2 100644 --- a/docs/design/migrate-evaluator-playground/new-endpoints.md +++ b/docs/design/migrate-evaluator-playground/new-endpoints.md @@ -259,6 +259,8 @@ Response: SimpleEvaluatorsResponse } ``` +**Note:** For the Evaluator Registry (automatic configs), pass `flags.is_human = false` and `include_archived = false` so archived or human evaluators don't show up. + ### Create Evaluator Config **Old:** @@ -284,7 +286,7 @@ Request: SimpleEvaluatorCreateRequest evaluator: { slug: string # Generated from name name: string - flags: { is_evaluator: true } + flags: { is_evaluator: true, is_human: false } data: { uri: "agenta:builtin:{evaluator_key}:v0" parameters: object # settings_values @@ -300,6 +302,8 @@ Response: SimpleEvaluatorResponse } ``` +**Note:** Workflow slugs are unique per project. We append a short random suffix when generating slugs to avoid collisions when names repeat. + ### Update Evaluator Config **Old:** @@ -333,6 +337,8 @@ Request: SimpleEvaluatorEditRequest Response: SimpleEvaluatorResponse ``` +**Note:** `SimpleEvaluatorEdit.data` is treated as the full revision payload. When updating, include the existing `data.uri` (and any schemas) along with `data.parameters` to avoid clearing the URI. 
+ ### Delete Evaluator Config **Old:** diff --git a/docs/design/migrate-evaluator-playground/plan.md b/docs/design/migrate-evaluator-playground/plan.md index a234ec2111..8a384658f9 100644 --- a/docs/design/migrate-evaluator-playground/plan.md +++ b/docs/design/migrate-evaluator-playground/plan.md @@ -139,14 +139,17 @@ export function buildEvaluatorUri(evaluatorKey: string): string { } /** - * Generate slug from name + * Generate slug from name (append suffix to avoid collisions) */ export function generateSlug(name: string): string { - return name + const base = name .toLowerCase() .replace(/[^a-z0-9]+/g, "-") .replace(/^-|-$/g, "") - .substring(0, 50) // limit length + + const suffix = Math.random().toString(36).slice(2, 8) + const maxBaseLength = Math.max(1, 50 - suffix.length - 1) + return `${base.slice(0, maxBaseLength)}-${suffix}` } // ============ CRUD Functions ============ @@ -162,7 +165,10 @@ export const fetchAllEvaluatorConfigs = async ( const response = await axios.post( `${getAgentaApiUrl()}/preview/simple/evaluators/query?project_id=${projectId}`, - { evaluator: { flags: { is_evaluator: true } } } + { + evaluator: { flags: { is_evaluator: true, is_human: false } }, + include_archived: false, + } ) return response.data?.evaluators || [] @@ -178,7 +184,7 @@ export const createEvaluatorConfig = async ( const payload: SimpleEvaluatorCreate = { slug: generateSlug(name), name, - flags: { is_evaluator: true }, + flags: { is_evaluator: true, is_human: false }, data: { uri: buildEvaluatorUri(evaluatorKey), parameters: settingsValues, @@ -199,15 +205,21 @@ export const createEvaluatorConfig = async ( export const updateEvaluatorConfig = async ( evaluatorId: string, updates: { name?: string; settingsValues?: Record }, + existing?: SimpleEvaluator, ): Promise => { const {projectId} = getProjectValues() + // IMPORTANT: include existing data (uri/schemas) when editing const payload: SimpleEvaluatorEdit = { id: evaluatorId, - name: updates.name, - data: 
updates.settingsValues - ? { parameters: updates.settingsValues } - : undefined, + name: updates.name ?? existing?.name, + data: { + ...(existing?.data ?? {}), + ...(updates.settingsValues ? {parameters: updates.settingsValues} : {}), + }, + tags: existing?.tags, + meta: existing?.meta, + flags: existing?.flags, } const response = await axios.put( @@ -338,7 +350,7 @@ form.setFieldsValue({ settings_values: editEvalEditValues.settings_values, }) -// After +// After (use parameters field to match SimpleEvaluator) form.setFieldsValue({ name: simpleEvaluator.name, parameters: simpleEvaluator.data?.parameters, @@ -354,7 +366,7 @@ Update to work with `SimpleEvaluator[]`: const enrichedEvaluators = evaluators.map((e) => ({ ...e, evaluator_key: extractEvaluatorKeyFromUri(e.data?.uri), - settings_values: e.data?.parameters, // for backward compat in UI + parameters: e.data?.parameters, })) ``` @@ -588,7 +600,7 @@ const runResponse = await createEvaluatorRunExecution( selectedEvaluator.key, { inputs: outputs, - settings: formValues.settings_values, + settings: formValues.parameters, } ) @@ -713,7 +725,7 @@ If other parts of the app use `createEvaluatorRunExecution`, update them too: ## Open Questions -1. **Slug uniqueness:** Does backend enforce unique slugs? If collision, does it auto-suffix? +1. **Slug uniqueness:** Backend enforces unique slugs per project; generate a short suffix client-side to avoid collisions. 2. **Output schemas:** Should frontend pass `data.schemas.outputs` when creating? Or does backend derive from evaluator type? 
diff --git a/docs/design/migrate-evaluator-playground/status.md b/docs/design/migrate-evaluator-playground/status.md index b566579b5d..dbce737e8f 100644 --- a/docs/design/migrate-evaluator-playground/status.md +++ b/docs/design/migrate-evaluator-playground/status.md @@ -1,6 +1,6 @@ # Status: Evaluator Playground Migration -## Current Phase: Planning Complete +## Current Phase: PR 1 (CRUD) In Progress **Last Updated:** 2026-01-27 @@ -55,8 +55,7 @@ See [plan.md](./plan.md) for detailed implementation steps. ### Next Steps -- [ ] Wait for PR #3527 to be merged -- [ ] Start PR 1: CRUD migration +- [ ] Complete PR 1: CRUD migration (stacked on PR #3527) - [ ] After PR 1 stable, start PR 2: Run migration --- @@ -106,7 +105,7 @@ The SDK maintains a `HANDLER_REGISTRY` that maps URIs to handler functions: ## Open Questions -1. **Slug uniqueness:** Does backend enforce unique slugs? If collision, does it auto-suffix? +1. **Slug uniqueness:** Backend enforces unique slugs per project; generate a short suffix client-side to avoid collisions. 2. **Output schemas:** Should frontend pass `data.schemas.outputs` when creating? Or does backend derive from evaluator type? 
diff --git a/web/oss/src/components/Evaluators/assets/types.ts b/web/oss/src/components/Evaluators/assets/types.ts index f928cdc801..ccfdfaaa06 100644 --- a/web/oss/src/components/Evaluators/assets/types.ts +++ b/web/oss/src/components/Evaluators/assets/types.ts @@ -1,5 +1,5 @@ import {EvaluatorPreviewDto} from "@/oss/lib/hooks/useEvaluators/types" -import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types" +import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types" export type EvaluatorCategory = "automatic" | "human" @@ -15,7 +15,7 @@ export type EvaluatorPreview = EvaluatorPreviewDto & { metrics?: Record } -export type EvaluatorConfigRow = EvaluatorConfig & { +export type EvaluatorConfigRow = SimpleEvaluator & { evaluator?: Evaluator | null kind?: "config" } diff --git a/web/oss/src/components/Evaluators/assets/utils.ts b/web/oss/src/components/Evaluators/assets/utils.ts index 4b09fa2d46..a750ce248f 100644 --- a/web/oss/src/components/Evaluators/assets/utils.ts +++ b/web/oss/src/components/Evaluators/assets/utils.ts @@ -1,6 +1,7 @@ +import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils" import {formatDay} from "@/oss/lib/helpers/dateTimeHelper" import {capitalize} from "@/oss/lib/helpers/utils" -import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types" +import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types" import { EvaluatorCategory, @@ -54,7 +55,7 @@ const formatDate = (value?: string) => { return formatDay({date: value}) } -const collectConfigTags = (config: EvaluatorConfig, evaluator?: Evaluator | null) => { +const collectConfigTags = (config: SimpleEvaluator, evaluator?: Evaluator | null) => { const tags = new Set() if (Array.isArray(config.tags)) { @@ -132,11 +133,12 @@ export const transformEvaluatorsToRows = ( } const buildConfigTypeBadge = ( - config: EvaluatorConfig, + config: SimpleEvaluator, category: Extract, evaluator?: Evaluator | null, ): EvaluatorTypeBadge => { - const label = evaluator?.name || createTypeLabel(config.evaluator_key, 
config.name) + const evaluatorKey = resolveEvaluatorKey(config) + const label = evaluator?.name || createTypeLabel(evaluatorKey, config.name) const colorHex = config.color || evaluator?.color return { @@ -146,44 +148,54 @@ const buildConfigTypeBadge = ( } } -const extractConfigVersion = (config: EvaluatorConfig) => { - const serviceValues = (config.settings_values as any)?.service || {} +const extractConfigVersion = (config: SimpleEvaluator) => { + const parameters = (config.data as any)?.parameters || {} + const serviceValues = (config.data as any)?.service || {} + const serviceConfig = serviceValues?.configuration || {} const candidate = (config as any)?.version || serviceValues?.agenta || serviceValues?.version || - (config.settings_values as any)?.version || + serviceConfig?.version || + serviceConfig?.agenta || + parameters?.version || "" return sanitizeVersion(typeof candidate === "string" ? candidate : "") } -const extractConfigModifiedBy = (config: EvaluatorConfig) => { +const extractConfigModifiedBy = (config: SimpleEvaluator) => { const modifiedBy = (config as any)?.updated_by || (config as any)?.updatedBy || + (config as any)?.updated_by_id || + (config as any)?.updatedById || (config as any)?.created_by || (config as any)?.createdBy || + (config as any)?.created_by_id || + (config as any)?.createdById || "" return typeof modifiedBy === "string" ? modifiedBy : "" } export const transformEvaluatorConfigsToRows = ( - configs: EvaluatorConfig[], + configs: SimpleEvaluator[], category: Extract, evaluators: Evaluator[], ): EvaluatorRegistryRow[] => { const evaluatorsMap = new Map(evaluators.map((item) => [item.key, item])) return configs.map((config) => { - const evaluator = evaluatorsMap.get(config.evaluator_key) || null + const evaluatorKey = resolveEvaluatorKey(config) + const evaluator = evaluatorKey ? 
evaluatorsMap.get(evaluatorKey) || null : null const badge = buildConfigTypeBadge(config, category, evaluator) const versionLabel = extractConfigVersion(config) const tags = collectConfigTags(config, evaluator) const modifiedBy = extractConfigModifiedBy(config) const createdAt = config.created_at const updatedAt = config.updated_at || createdAt + const displayName = config.name || evaluator?.name || evaluatorKey || config.slug || "" const raw: EvaluatorConfigRow = { ...config, @@ -194,15 +206,15 @@ export const transformEvaluatorConfigsToRows = ( return { key: config.id, id: config.id, - name: config.name, - slug: config.evaluator_key, + name: displayName, + slug: evaluatorKey || config.slug, typeBadge: badge, versionLabel, tags, dateCreated: formatDate(createdAt), lastModified: formatDate(updatedAt), modifiedBy, - avatarName: modifiedBy || config.name, + avatarName: modifiedBy || displayName, raw, } }) diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx index ca07709a52..e1494219ab 100644 --- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx +++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx @@ -25,6 +25,7 @@ import { resetPlaygroundAtom, } from "@/oss/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms" import useURL from "@/oss/hooks/useURL" +import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils" import useFetchEvaluatorsData from "@/oss/lib/hooks/useFetchEvaluatorsData" import {Evaluator} from "@/oss/lib/Types" import {evaluatorByKeyAtomFamily} from "@/oss/state/evaluators" @@ -63,7 +64,7 @@ const ConfigureEvaluatorPage = ({evaluatorId}: {evaluatorId?: string | null}) => ) }, [evaluatorConfigs, evaluatorId, stagedConfig]) - const evaluatorKey = existingConfig?.evaluator_key ?? evaluatorId ?? 
null + const evaluatorKey = resolveEvaluatorKey(existingConfig) ?? evaluatorId ?? null const evaluatorQuery = useAtomValue(evaluatorByKeyAtomFamily(evaluatorKey)) const evaluatorFromRegular = evaluators.find((item) => item.key === evaluatorKey) diff --git a/web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts b/web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts index 3aa171dc76..97fbb7ffc4 100644 --- a/web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts +++ b/web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts @@ -2,7 +2,7 @@ import {useCallback, useMemo} from "react" import useEvaluators from "@/oss/lib/hooks/useEvaluators" import useFetchEvaluatorsData from "@/oss/lib/hooks/useFetchEvaluatorsData" -import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types" +import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types" import {EvaluatorCategory, EvaluatorPreview, EvaluatorRegistryRow} from "../assets/types" import { @@ -33,7 +33,7 @@ const useEvaluatorsRegistryData = (category: EvaluatorCategory) => { const humanEvaluators = (humanEvaluatorsSwr.data || []) as EvaluatorPreview[] unsortedRows = transformEvaluatorsToRows(humanEvaluators, "human") } else { - const evaluatorConfigs = (evaluatorConfigsSwr.data || []) as EvaluatorConfig[] + const evaluatorConfigs = (evaluatorConfigsSwr.data || []) as SimpleEvaluator[] const baseEvaluators = (baseEvaluatorsSwr.data || []) as Evaluator[] unsortedRows = transformEvaluatorConfigsToRows( diff --git a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/NewEvaluationModalInner.tsx b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/NewEvaluationModalInner.tsx index cffdfdcd23..fd64b589ec 100644 --- a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/NewEvaluationModalInner.tsx +++ b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/NewEvaluationModalInner.tsx @@ -7,6 +7,7 @@ import {useRouter} from 
"next/router" import {message} from "@/oss/components/AppMessageContext" import useURL from "@/oss/hooks/useURL" import {useVaultSecret} from "@/oss/hooks/useVaultSecret" +import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils" import {redirectIfNoLLMKeys} from "@/oss/lib/helpers/utils" import useAppVariantRevisions from "@/oss/lib/hooks/useAppVariantRevisions" import useFetchEvaluatorsData from "@/oss/lib/hooks/useFetchEvaluatorsData" @@ -289,7 +290,7 @@ const NewEvaluationModalInner = ({ !preview && selectedEvalConfigs.some( (id) => - evaluatorConfigs.find((config) => config.id === id)?.evaluator_key === + resolveEvaluatorKey(evaluatorConfigs.find((config) => config.id === id)) === "auto_ai_critique", ) && (await redirectIfNoLLMKeys({secrets})) diff --git a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/SelectEvaluatorSection/SelectEvaluatorSection.tsx b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/SelectEvaluatorSection/SelectEvaluatorSection.tsx index 3545f0b98a..b7bd3b649a 100644 --- a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/SelectEvaluatorSection/SelectEvaluatorSection.tsx +++ b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/SelectEvaluatorSection/SelectEvaluatorSection.tsx @@ -11,9 +11,10 @@ import router from "next/router" import {getMetricsFromEvaluator} from "@/oss/components/SharedDrawers/AnnotateDrawer/assets/transforms" import useURL from "@/oss/hooks/useURL" +import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils" import {EvaluatorDto} from "@/oss/lib/hooks/useEvaluators/types" import useFetchEvaluatorsData from "@/oss/lib/hooks/useFetchEvaluatorsData" -import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types" +import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types" import {openEvaluatorDrawerAtom} from "../../../autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms" import type {SelectEvaluatorSectionProps} from "../../types" @@ -88,12 
+89,12 @@ const SelectEvaluatorSection = ({ const evaluatorConfigs = useMemo(() => { if (preview) { - return evaluators as EvaluatorConfig[] + return [] as SimpleEvaluator[] } return ( propsEvaluatorConfigs?.length ? propsEvaluatorConfigs : evaluatorConfigsSwr.data || [] - ) as EvaluatorConfig[] - }, [preview, propsEvaluatorConfigs, evaluatorConfigsSwr.data, evaluators]) + ) as SimpleEvaluator[] + }, [preview, propsEvaluatorConfigs, evaluatorConfigsSwr.data]) const isLoadingEvaluators = fetchLoadingEvaluators const isLoadingEvaluatorConfigs = fetchLoadingConfigs @@ -122,7 +123,7 @@ const SelectEvaluatorSection = ({ const availableIds = new Set( (preview ? (evaluators as EvaluatorDto<"response">[]) - : (evaluatorConfigs as EvaluatorConfig[]) + : (evaluatorConfigs as SimpleEvaluator[]) ).map((config) => config.id), ) @@ -141,10 +142,9 @@ const SelectEvaluatorSection = ({ // Handler to open the drawer in edit mode const handleEditConfig = useCallback( - (record: EvaluatorConfig) => { - const evaluator = (evaluators as Evaluator[]).find( - (e) => e.key === record.evaluator_key, - ) + (record: SimpleEvaluator) => { + const evaluatorKey = resolveEvaluatorKey(record) + const evaluator = (evaluators as Evaluator[]).find((e) => e.key === evaluatorKey) if (evaluator) { openEvaluatorDrawer({ evaluator, @@ -158,10 +158,9 @@ const SelectEvaluatorSection = ({ // Handler to open the drawer in clone mode const handleCloneConfig = useCallback( - (record: EvaluatorConfig) => { - const evaluator = (evaluators as Evaluator[]).find( - (e) => e.key === record.evaluator_key, - ) + (record: SimpleEvaluator) => { + const evaluatorKey = resolveEvaluatorKey(record) + const evaluator = (evaluators as Evaluator[]).find((e) => e.key === evaluatorKey) if (evaluator) { openEvaluatorDrawer({ evaluator, @@ -203,13 +202,13 @@ const SelectEvaluatorSection = ({ [], ) - const columnsConfig: ColumnsType = useMemo( + const columnsConfig: ColumnsType = useMemo( () => [ { title: "Name", dataIndex: "name", 
key: "name", - render: (_, record: EvaluatorConfig) => { + render: (_, record: SimpleEvaluator) => { return
{record.name}
}, }, @@ -217,10 +216,11 @@ const SelectEvaluatorSection = ({ title: "Type", dataIndex: "type", key: "type", - render: (x, record: EvaluatorConfig) => { + render: (x, record: SimpleEvaluator) => { // Find the evaluator by key to display its name + const evaluatorKey = resolveEvaluatorKey(record) const evaluator = (evaluators as Evaluator[]).find( - (item) => item.key === record.evaluator_key, + (item) => item.key === evaluatorKey, ) return {evaluator?.name} }, @@ -231,7 +231,7 @@ const SelectEvaluatorSection = ({ width: 56, fixed: "right", align: "center", - render: (_, record: EvaluatorConfig) => { + render: (_, record: SimpleEvaluator) => { return ( ({ // Conditionally type filteredEvalConfigs based on Preview const filteredEvalConfigs: Preview extends true ? EvaluatorDto<"response">[] - : EvaluatorConfig[] = useMemo(() => { + : SimpleEvaluator[] = useMemo(() => { if (preview) { // Explicitly narrow types for Preview = true (human evaluations) let data = evaluators as EvaluatorDto<"response">[] @@ -295,21 +295,21 @@ const SelectEvaluatorSection = ({ if (!searchTerm) return data as any return data.filter((item) => - item.name.toLowerCase().includes(searchTerm.toLowerCase()), + (item.name || "").toLowerCase().includes(searchTerm.toLowerCase()), ) as any } else { // Explicitly narrow types for Preview = false - const data = evaluatorConfigs as EvaluatorConfig[] + const data = evaluatorConfigs as SimpleEvaluator[] if (!searchTerm) return data return data.filter((item) => - item.name.toLowerCase().includes(searchTerm.toLowerCase()), + (item.name || "").toLowerCase().includes(searchTerm.toLowerCase()), ) as any } }, [searchTerm, evaluatorConfigs, preview, evaluators]) const onSelectEvalConfig = (selectedRowKeys: React.Key[]) => { const currentSelected = new Set(selectedEvalConfigs) - const configs = filteredEvalConfigs as EvaluatorDto<"response">[] + const configs = filteredEvalConfigs as {id: string}[] configs.forEach((item) => { if (selectedRowKeys.includes(item.id)) 
{ currentSelected.add(item.id) @@ -331,7 +331,7 @@ const SelectEvaluatorSection = ({ ).length > 0 ) } - return (evaluatorConfigs as EvaluatorConfig[]).length > 0 + return (evaluatorConfigs as SimpleEvaluator[]).length > 0 }, [preview, evaluators, evaluatorConfigs]) return ( @@ -418,7 +418,7 @@ const SelectEvaluatorSection = ({ pagination={false} /> ) : ( - + rowSelection={{ type: "checkbox", columnWidth: 48, @@ -442,7 +442,7 @@ const SelectEvaluatorSection = ({ className="ph-no-capture" columns={columnsConfig} rowKey={"id"} - dataSource={filteredEvalConfigs as EvaluatorConfig[]} + dataSource={filteredEvalConfigs as SimpleEvaluator[]} scroll={{x: true, y: 455}} bordered pagination={false} diff --git a/web/oss/src/components/pages/evaluations/NewEvaluation/types.ts b/web/oss/src/components/pages/evaluations/NewEvaluation/types.ts index a068971bc4..5f838f8665 100644 --- a/web/oss/src/components/pages/evaluations/NewEvaluation/types.ts +++ b/web/oss/src/components/pages/evaluations/NewEvaluation/types.ts @@ -4,7 +4,7 @@ import {ModalProps} from "antd" import {EvaluatorDto} from "@/oss/lib/hooks/useEvaluators/types" import {EnhancedVariant} from "@/oss/lib/shared/variant/transformer/types" -import {LLMRunRateLimit, Evaluator, EvaluatorConfig, testset} from "@/oss/lib/Types" +import {LLMRunRateLimit, Evaluator, SimpleEvaluator, testset} from "@/oss/lib/Types" export interface NewEvaluationAppOption { label: string @@ -54,7 +54,7 @@ export interface NewEvaluationModalContentProps extends HTMLProps[] - evaluatorConfigs: EvaluatorConfig[] + evaluatorConfigs: SimpleEvaluator[] advanceSettings: LLMRunRateLimitWithCorrectAnswer setAdvanceSettings: Dispatch> appOptions: NewEvaluationAppOption[] @@ -95,7 +95,7 @@ export interface SelectTestsetSectionProps extends HTMLProps { } export interface SelectEvaluatorSectionProps extends HTMLProps { - evaluatorConfigs: EvaluatorConfig[] + evaluatorConfigs: SimpleEvaluator[] evaluators: Evaluator[] selectedEvalConfigs: string[] 
setSelectedEvalConfigs: Dispatch> diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx index 6957d3438a..6a0aed5f8f 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx @@ -71,7 +71,7 @@ const AdvancedSettings: React.FC = ({settings, selectedTe return ( { setEvalOutputStatus({success: false, error: false}) setIsLoadingResult(true) - const settingsValues = form.getFieldValue("settings_values") || {} - let normalizedSettings = {...settingsValues} + const parameters = form.getFieldValue("parameters") || {} + let normalizedSettings = {...parameters} if (typeof normalizedSettings.json_schema === "string") { try { @@ -419,7 +419,7 @@ const DebugSection = () => { } if (!selectedEvaluator.key.startsWith("rag_")) { - const correctAnswerKey = settingsValues.correct_answer_key + const correctAnswerKey = parameters.correct_answer_key const groundTruthKey = typeof correctAnswerKey === "string" && correctAnswerKey.startsWith("testcase.") ? 
correctAnswerKey.split(".")[1] diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx index a8128c43e7..c7a3df73f6 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx @@ -105,7 +105,7 @@ export const DynamicFormField: React.FC = ({ form, }) => { const settingsValue = Form.useWatch(name, form) - const runtime = Form.useWatch(["settings_values", "runtime"], form) + const runtime = Form.useWatch(["parameters", "runtime"], form) const classes = useStyles() const {token} = theme.useToken() diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx index a96a07a37f..f5ddf000df 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx @@ -55,7 +55,7 @@ export const FieldsTagsEditor: React.FC = ({ // Watch the correct_answer_key from form to react to changes // Using Form.useWatch instead of form.getFieldValue for reactivity - const formCorrectAnswerKey = Form.useWatch(["settings_values", "correct_answer_key"], form) + const formCorrectAnswerKey = Form.useWatch(["parameters", "correct_answer_key"], form) const effectiveKey = formCorrectAnswerKey || correctAnswerKey // Check if we can detect fields from testcase diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx 
b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index 331afe0852..1454b99565 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -13,7 +13,7 @@ import {useAppId} from "@/oss/hooks/useAppId" import useURL from "@/oss/hooks/useURL" import {EvaluationSettingsTemplate, JSSTheme, SettingsPreset} from "@/oss/lib/Types" import { - CreateEvaluationConfigData, + CreateEvaluatorConfigData, createEvaluatorConfig, updateEvaluatorConfig, } from "@/oss/services/evaluations/api" @@ -69,6 +69,13 @@ interface ConfigureEvaluatorProps { onToggleTestPanel?: () => void } +interface ConfigureEvaluatorFormValues { + name: string + description?: string + tags?: string[] + parameters?: Record +} + const useStyles = createUseStyles((theme: JSSTheme) => ({ collapseContainer: { "& .ant-collapse-header": { @@ -199,12 +206,10 @@ const ConfigureEvaluator = ({ const allKeys = Array.from(new Set([...templateKeys, ...presetKeys])) // Clear subtree before applying new values to avoid stale keys - form.setFieldsValue({settings_values: {}}) + form.setFieldsValue({parameters: {}}) if (allKeys.length) { - const fieldNames = allKeys.map( - (key) => ["settings_values", key] as (string | number)[], - ) + const fieldNames = allKeys.map((key) => ["parameters", key] as (string | number)[]) form.resetFields(fieldNames) const nextFields = fieldNames @@ -248,7 +253,7 @@ const ConfigureEvaluator = ({ const evaluatorVersionNumber = useMemo(() => { const raw = - editEvalEditValues?.settings_values?.version ?? + editEvalEditValues?.data?.parameters?.version ?? selectedEvaluator?.settings_template?.version?.default ?? 3 @@ -256,7 +261,7 @@ const ConfigureEvaluator = ({ // extract leading number (e.g., "4", "4.1", "v4") const match = String(raw).match(/\d+(\.\d+)?/) return match ? 
parseFloat(match[0]) : 3 - }, [editEvalEditValues?.settings_values?.version, selectedEvaluator]) + }, [editEvalEditValues?.data?.parameters?.version, selectedEvaluator]) const evalFields = useMemo(() => { const templateEntries = Object.entries(selectedEvaluator?.settings_template || {}) @@ -283,28 +288,25 @@ const ConfigureEvaluator = ({ const advancedSettingsFields = evalFields.filter((field) => field.advanced) const basicSettingsFields = evalFields.filter((field) => !field.advanced) - const onSubmit = async (values: CreateEvaluationConfigData) => { + const onSubmit = async (values: ConfigureEvaluatorFormValues) => { try { setSubmitLoading(true) if (!selectedEvaluator?.key) throw new Error("No selected key") - const settingsValues = values.settings_values || {} + const parameters = values.parameters || {} - const jsonSchemaFieldPath: (string | number)[] = ["settings_values", "json_schema"] - const hasJsonSchema = Object.prototype.hasOwnProperty.call( - settingsValues, - "json_schema", - ) + const jsonSchemaFieldPath: (string | number)[] = ["parameters", "json_schema"] + const hasJsonSchema = Object.prototype.hasOwnProperty.call(parameters, "json_schema") if (hasJsonSchema) { form.setFields([{name: jsonSchemaFieldPath, errors: []}]) - if (typeof settingsValues.json_schema === "string") { + if (typeof parameters.json_schema === "string") { try { - const parsed = JSON.parse(settingsValues.json_schema) + const parsed = JSON.parse(parameters.json_schema) if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) { throw new Error() } - settingsValues.json_schema = parsed + parameters.json_schema = parsed } catch { form.setFields([ { @@ -315,9 +317,9 @@ const ConfigureEvaluator = ({ throw new Error("JSON schema must be a valid JSON object") } } else if ( - settingsValues.json_schema && - (typeof settingsValues.json_schema !== "object" || - Array.isArray(settingsValues.json_schema)) + parameters.json_schema && + (typeof parameters.json_schema !== "object" || + 
Array.isArray(parameters.json_schema)) ) { form.setFields([ { @@ -329,40 +331,43 @@ const ConfigureEvaluator = ({ } } - const data = { - ...values, - evaluator_key: selectedEvaluator!.key, - settings_values: settingsValues, + const existingParameters = editEvalEditValues?.data?.parameters || {} + const mergedParameters = {...existingParameters, ...parameters} + + const payload: CreateEvaluatorConfigData = { + name: values.name, + description: values.description, + tags: values.tags, + evaluator_key: selectedEvaluator.key, + parameters, } if (editMode) { - await updateEvaluatorConfig(editEvalEditValues?.id!, data) - - // Update atom with merged values - const updatedConfig = editEvalEditValues - ? { - ...editEvalEditValues, - ...data, - settings_values: settingsValues, - } - : null - if (updatedConfig) { - commitPlayground(updatedConfig) - } + const updatedEvaluator = await updateEvaluatorConfig(editEvalEditValues?.id!, { + id: editEvalEditValues?.id!, + name: values.name, + description: editEvalEditValues?.description, + tags: editEvalEditValues?.tags, + meta: editEvalEditValues?.meta, + flags: editEvalEditValues?.flags, + data: { + ...(editEvalEditValues?.data ?? 
{}), + parameters: mergedParameters, + }, + }) + + commitPlayground(updatedEvaluator) } else { - const response = await createEvaluatorConfig(appId, data) - const createdConfig = response?.data - - if (createdConfig) { - // Use commitPlayground to update state and switch to edit mode - commitPlayground(createdConfig) - if (uiVariant === "page" && createdConfig.id) { - await router.replace( - `${projectURL}/evaluators/configure/${encodeURIComponent( - createdConfig.id, - )}`, - ) - } + const createdConfig = await createEvaluatorConfig(appId, payload) + + // Use commitPlayground to update state and switch to edit mode + commitPlayground(createdConfig) + if (uiVariant === "page" && createdConfig.id) { + await router.replace( + `${projectURL}/evaluators/configure/${encodeURIComponent( + createdConfig.id, + )}`, + ) } } @@ -381,15 +386,15 @@ const ConfigureEvaluator = ({ form.resetFields() if (editMode && editEvalEditValues) { - // Load all values including nested settings_values + // Load all values including nested parameters form.setFieldsValue({ ...editEvalEditValues, - settings_values: editEvalEditValues.settings_values || {}, + parameters: editEvalEditValues.data?.parameters || {}, }) } else if (cloneConfig && editEvalEditValues) { - // When cloning, copy only settings_values and clear the name so user provides a new name + // When cloning, copy only parameters and clear the name so user provides a new name form.setFieldsValue({ - settings_values: editEvalEditValues.settings_values || {}, + parameters: editEvalEditValues.data?.parameters || {}, name: "", }) } else if (selectedEvaluator?.settings_template) { @@ -404,7 +409,7 @@ const ConfigureEvaluator = ({ } if (Object.keys(defaultSettings).length > 0) { form.setFieldsValue({ - settings_values: defaultSettings, + parameters: defaultSettings, }) } } @@ -556,7 +561,7 @@ const ConfigureEvaluator = ({ key={field.key} traceTree={traceTree} form={form} - name={["settings_values", field.key]} + name={["parameters", 
field.key]} /> ))} @@ -674,7 +679,7 @@ const ConfigureEvaluator = ({ key={field.key} traceTree={traceTree} form={form} - name={["settings_values", field.key]} + name={["parameters", field.key]} /> ))} diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts index 76b8c134c2..dcb15dcd42 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/state/atoms.ts @@ -18,7 +18,7 @@ import type {FormInstance} from "antd" import {atom} from "jotai" import {atomWithReset, atomWithStorage, RESET} from "jotai/utils" -import type {Evaluator, EvaluatorConfig, Variant} from "@/oss/lib/Types" +import type {Evaluator, SimpleEvaluator, Variant} from "@/oss/lib/Types" import {stringStorage} from "@/oss/state/utils/stringStorage" // ================================================================ @@ -84,7 +84,7 @@ export const playgroundIsCloneModeAtom = atom((get) => get(playgroundSessionAtom * - In edit mode: loaded from existing config * - In clone mode: copied from source config (with cleared name) */ -export const playgroundEditValuesAtom = atomWithReset(null) +export const playgroundEditValuesAtom = atomWithReset(null) // ================================================================ // FORM STATE @@ -95,7 +95,7 @@ export const playgroundEditValuesAtom = atomWithReset(nu * Allows DebugSection to read form values for running the evaluator * * This is set by ConfigureEvaluator when the form mounts - * and read by DebugSection to get current settings_values + * and read by DebugSection to get current parameters */ export const playgroundFormRefAtom = atom(null) @@ -179,7 +179,7 @@ export const initPlaygroundAtom = atom( set, payload: { evaluator: Evaluator - 
existingConfig?: EvaluatorConfig | null + existingConfig?: SimpleEvaluator | null mode?: PlaygroundMode }, ) => { @@ -226,7 +226,7 @@ export const resetPlaygroundAtom = atom(null, (get, set) => { * * @param savedConfig - The config returned from the API */ -export const commitPlaygroundAtom = atom(null, (get, set, savedConfig: EvaluatorConfig) => { +export const commitPlaygroundAtom = atom(null, (get, set, savedConfig: SimpleEvaluator) => { // Update edit values with saved config set(playgroundEditValuesAtom, savedConfig) @@ -280,7 +280,7 @@ export const openEvaluatorDrawerAtom = atom( set, payload: { evaluator: Evaluator - existingConfig?: EvaluatorConfig | null + existingConfig?: SimpleEvaluator | null mode?: PlaygroundMode }, ) => { diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/DeleteModal.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/DeleteModal.tsx index 0ac235b386..c30bb3c1f1 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/DeleteModal.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/DeleteModal.tsx @@ -5,11 +5,11 @@ import {Modal, Space, theme, Typography} from "antd" import {createUseStyles} from "react-jss" import {checkIfResourceValidForDeletion} from "@/oss/lib/evaluations/legacy" -import {EvaluatorConfig, JSSTheme} from "@/oss/lib/Types" +import {JSSTheme, SimpleEvaluator} from "@/oss/lib/Types" import {deleteEvaluatorConfig} from "@/oss/services/evaluations/api" type DeleteModalProps = { - selectedEvalConfig: EvaluatorConfig + selectedEvalConfig: SimpleEvaluator onSuccess: () => void } & React.ComponentProps diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx index f3c9434a38..72aaf034fc 100644 --- 
a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx @@ -7,18 +7,19 @@ import {useAtom} from "jotai" import {createUseStyles} from "react-jss" import {evaluatorsAtom} from "@/oss/lib/atoms/evaluation" +import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils" import {formatDay} from "@/oss/lib/helpers/dateTimeHelper" -import {Evaluator, EvaluatorConfig, JSSTheme} from "@/oss/lib/Types" +import {Evaluator, JSSTheme, SimpleEvaluator} from "@/oss/lib/Types" import DeleteModal from "./DeleteModal" interface EvaluatorCardProps { - evaluatorConfigs: EvaluatorConfig[] + evaluatorConfigs: SimpleEvaluator[] setEditMode: React.Dispatch> setCloneConfig: React.Dispatch> setCurrent: React.Dispatch> setSelectedEvaluator: React.Dispatch> - setEditEvalEditValues: React.Dispatch> + setEditEvalEditValues: React.Dispatch> onSuccess: () => void } @@ -88,22 +89,21 @@ const EvaluatorCard = ({ const classes = useStyles() const evaluators = useAtom(evaluatorsAtom)[0] const [openDeleteModal, setOpenDeleteModal] = useState(false) - const [selectedDelEval, setSelectedDelEval] = useState(null) + const [selectedDelEval, setSelectedDelEval] = useState(null) return (
{evaluatorConfigs.length ? ( evaluatorConfigs.map((item) => { - const evaluator = evaluators.find((e) => e.key === item.evaluator_key) + const evaluatorKey = resolveEvaluatorKey(item) + const evaluator = evaluators.find((e) => e.key === evaluatorKey) return ( { - const selectedEval = evaluators.find( - (e) => e.key === item.evaluator_key, - ) + const selectedEval = evaluators.find((e) => e.key === evaluatorKey) if (selectedEval) { setEditMode(true) setSelectedEvaluator(selectedEval) @@ -130,7 +130,7 @@ const EvaluatorCard = ({ onClick: (e: any) => { e.domEvent.stopPropagation() const selectedEval = evaluators.find( - (e) => e.key === item.evaluator_key, + (e) => e.key === evaluatorKey, ) if (selectedEval) { setEditMode(true) @@ -147,7 +147,7 @@ const EvaluatorCard = ({ onClick: (e: any) => { e.domEvent.stopPropagation() const selectedEval = evaluators.find( - (e) => e.key === item.evaluator_key, + (e) => e.key === evaluatorKey, ) if (selectedEval) { setCloneConfig(true) diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx index 2e38bfd1c2..33c03a9f89 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx @@ -7,17 +7,18 @@ import {ColumnsType} from "antd/es/table" import {useAtom} from "jotai" import {evaluatorsAtom} from "@/oss/lib/atoms/evaluation" -import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types" +import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils" +import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types" import DeleteModal from "./DeleteModal" interface EvaluatorListProps { - evaluatorConfigs: EvaluatorConfig[] + evaluatorConfigs: SimpleEvaluator[] setEditMode: React.Dispatch> setCloneConfig: React.Dispatch> 
setCurrent: React.Dispatch> setSelectedEvaluator: React.Dispatch> - setEditEvalEditValues: React.Dispatch> + setEditEvalEditValues: React.Dispatch> onSuccess: () => void } @@ -32,9 +33,9 @@ const EvaluatorList = ({ }: EvaluatorListProps) => { const evaluators = useAtom(evaluatorsAtom)[0] const [openDeleteModal, setOpenDeleteModal] = useState(false) - const [selectedDelEval, setSelectedDelEval] = useState(null) + const [selectedDelEval, setSelectedDelEval] = useState(null) - const columns: ColumnsType = [ + const columns: ColumnsType = [ // { // title: "Version", // dataIndex: "version", @@ -56,7 +57,8 @@ const EvaluatorList = ({ dataIndex: "type", key: "type", render: (_, record) => { - const evaluator = evaluators.find((item) => item.key === record.evaluator_key) + const evaluatorKey = resolveEvaluatorKey(record) + const evaluator = evaluators.find((item) => item.key === evaluatorKey) return {evaluator?.name} }, }, @@ -84,8 +86,9 @@ const EvaluatorList = ({ icon: , onClick: (e: any) => { e.domEvent.stopPropagation() + const evaluatorKey = resolveEvaluatorKey(record) const selectedEval = evaluators.find( - (e) => e.key === record.evaluator_key, + (e) => e.key === evaluatorKey, ) if (selectedEval) { setEditMode(true) @@ -101,8 +104,9 @@ const EvaluatorList = ({ icon: , onClick: (e: any) => { e.domEvent.stopPropagation() + const evaluatorKey = resolveEvaluatorKey(record) const selectedEval = evaluators.find( - (e) => e.key === record.evaluator_key, + (e) => e.key === evaluatorKey, ) if (selectedEval) { setCloneConfig(true) @@ -151,7 +155,8 @@ const EvaluatorList = ({ onRow={(record) => ({ style: {cursor: "pointer"}, onClick: () => { - const selectedEval = evaluators.find((e) => e.key === record.evaluator_key) + const evaluatorKey = resolveEvaluatorKey(record) + const selectedEval = evaluators.find((e) => e.key === evaluatorKey) if (selectedEval) { setEditMode(true) setSelectedEvaluator(selectedEval) diff --git 
a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx index 60569766c2..564bc38df9 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx @@ -8,21 +8,22 @@ import {createUseStyles} from "react-jss" import {evaluatorsAtom} from "@/oss/lib/atoms/evaluation" import {getEvaluatorTags} from "@/oss/lib/evaluations/legacy" -import {Evaluator, EvaluatorConfig, JSSTheme} from "@/oss/lib/Types" +import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils" +import {Evaluator, JSSTheme, SimpleEvaluator} from "@/oss/lib/Types" import {nonArchivedEvaluatorsAtom} from "@/oss/state/evaluators" import EvaluatorCard from "./EvaluatorCard" import EvaluatorList from "./EvaluatorList" interface EvaluatorsProps { - evaluatorConfigs: EvaluatorConfig[] + evaluatorConfigs: SimpleEvaluator[] handleOnCancel: () => void setCurrent: React.Dispatch> setSelectedEvaluator: React.Dispatch> fetchingEvalConfigs: boolean setEditMode: React.Dispatch> setCloneConfig: React.Dispatch> - setEditEvalEditValues: React.Dispatch> + setEditEvalEditValues: React.Dispatch> onSuccess: () => void setEvaluatorsDisplay: any evaluatorsDisplay: string @@ -95,10 +96,13 @@ const Evaluators = ({ const updatedEvaluatorConfigs = useMemo(() => { return evaluatorConfigs.map((config) => { - const matchingEvaluator = evaluators.find( - (evaluator) => evaluator.key === config.evaluator_key, + const evaluatorKey = resolveEvaluatorKey(config) + const matchingEvaluator = evaluators.find((evaluator) => evaluator.key === evaluatorKey) + if (!matchingEvaluator) return config + const nextTags = Array.from( + new Set([...(config.tags || []), ...(matchingEvaluator.tags || [])]), ) - return matchingEvaluator ? 
{...config, tags: matchingEvaluator.tags} : config + return {...config, tags: nextTags} }) }, [evaluatorConfigs, evaluators]) @@ -111,7 +115,7 @@ const Evaluators = ({ if (searchTerm) { filtered = filtered.filter((item) => - item.name.toLowerCase().includes(searchTerm.toLowerCase()), + (item.name || "").toLowerCase().includes(searchTerm.toLowerCase()), ) } diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx index b89da2ee19..c06202394c 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx @@ -9,7 +9,7 @@ import EnhancedModal from "@/oss/components/EnhancedUIs/Modal" import {useAppId} from "@/oss/hooks/useAppId" import {evaluatorConfigsAtom} from "@/oss/lib/atoms/evaluation" import useFetchEvaluatorsData from "@/oss/lib/hooks/useFetchEvaluatorsData" -import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types" +import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types" import ConfigureEvaluator from "./ConfigureEvaluator" import {initPlaygroundAtom, resetPlaygroundAtom} from "./ConfigureEvaluator/state/atoms" @@ -39,7 +39,7 @@ const EvaluatorsModal = ({ useFetchEvaluatorsData({appId: appId ?? 
""}) const [editMode, setEditMode] = useState(false) const [cloneConfig, setCloneConfig] = useState(false) - const [editEvalEditValues, setEditEvalEditValues] = useState(null) + const [editEvalEditValues, setEditEvalEditValues] = useState(null) const [evaluatorsDisplay, setEvaluatorsDisplay] = useLocalStorage<"card" | "list">( "evaluator_view", "list", diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/OnlineEvaluationDrawer.tsx b/web/oss/src/components/pages/evaluations/onlineEvaluation/OnlineEvaluationDrawer.tsx index 9687626c3c..d0281e7539 100644 --- a/web/oss/src/components/pages/evaluations/onlineEvaluation/OnlineEvaluationDrawer.tsx +++ b/web/oss/src/components/pages/evaluations/onlineEvaluation/OnlineEvaluationDrawer.tsx @@ -63,7 +63,7 @@ const OnlineEvaluationDrawer = ({open, onClose, onCreate}: OnlineEvaluationDrawe const filterColumns = useMemo(() => getFilterColumns(), []) const [filters, setFilters] = useAtom(onlineEvalFiltersAtom) const resetFilters = useSetAtom(resetOnlineEvalFiltersAtom) - // Load preview evaluators (with IDs) to map evaluator_config.evaluator_key -> evaluator.id + // Load preview evaluators (with IDs) to map config URI key -> evaluator.id const previewEvaluatorsSwr = useEvaluators({preview: true, queries: {is_human: false}}) const baseEvaluators = (baseEvaluatorsSwr.data as Evaluator[] | undefined) ?? 
[] const evaluators = useAtomValue(evaluatorConfigsAtom) diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts index a49787e814..42612e4322 100644 --- a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts +++ b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts @@ -1,4 +1,5 @@ import {useMemo} from "react" +import {useMemo} from "react" import type {EvaluatorPreviewDto} from "@/oss/lib/hooks/useEvaluators/types" @@ -47,18 +48,41 @@ const mergeEvaluatorWithConfig = ( ...configAny, } - const previewSettings = isPlainObject(evaluatorAny.settings_values) - ? (evaluatorAny.settings_values as Record) + const previewData = isPlainObject(evaluatorAny.data) + ? (evaluatorAny.data as Record) : undefined - const configSettings = isPlainObject(configAny.settings_values) - ? (configAny.settings_values as Record) + const configData = isPlainObject(configAny.data) + ? (configAny.data as Record) : undefined + if (previewData || configData) { + const mergedData: Record = { + ...(previewData ?? {}), + ...(configData ?? {}), + } - if (previewSettings || configSettings) { - merged.settings_values = { + const previewParameters = isPlainObject(previewData?.parameters) + ? (previewData?.parameters as Record) + : undefined + const configParameters = isPlainObject(configData?.parameters) + ? (configData?.parameters as Record) + : undefined + const previewSettings = isPlainObject(evaluatorAny.settings_values) + ? (evaluatorAny.settings_values as Record) + : undefined + const configSettings = isPlainObject(configAny.settings_values) + ? (configAny.settings_values as Record) + : undefined + const mergedParameters = { + ...(previewParameters ?? {}), ...(previewSettings ?? {}), + ...(configParameters ?? {}), ...(configSettings ?? 
{}), } + if (Object.keys(mergedParameters).length) { + mergedData.parameters = mergedParameters + } + + merged.data = mergedData } return merged as EvaluatorPreviewDto diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx index af624b9f83..c1cf18acd2 100644 --- a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx +++ b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx @@ -1,9 +1,11 @@ import {useMemo} from "react" +import {useMemo} from "react" import {SelectProps} from "antd" +import {getEvaluatorParameters, resolveEvaluatorKey} from "@/oss/lib/evaluators/utils" import type {EvaluatorPreviewDto} from "@/oss/lib/hooks/useEvaluators/types" -import type {Evaluator} from "@/oss/lib/Types" +import type {Evaluator, SimpleEvaluator} from "@/oss/lib/Types" import { ALLOWED_ONLINE_EVALUATOR_KEYS, @@ -13,7 +15,7 @@ import { import {capitalize, collectEvaluatorCandidates} from "../utils/evaluatorDetails" interface UseEvaluatorSelectionParams { - evaluators: any[] + evaluators: SimpleEvaluator[] selectedEvaluatorId: string | undefined previewEvaluators: EvaluatorPreviewDto[] baseEvaluators: Evaluator[] @@ -21,16 +23,17 @@ interface UseEvaluatorSelectionParams { interface EvaluatorSelectionResult { evaluatorOptions: SelectProps["options"] - selectedEvaluatorConfig?: any + selectedEvaluatorConfig?: SimpleEvaluator matchedPreviewEvaluator?: EvaluatorPreviewDto evaluatorTypeLookup: Map } -const buildEvaluatorOptions = (configs: any[]): SelectProps["options"] => +const buildEvaluatorOptions = (configs: SimpleEvaluator[]): SelectProps["options"] => (configs || []).map((cfg: any) => { const iconSrc = (cfg?.icon_url && (cfg.icon_url.src || cfg.icon_url)) || undefined const displayName = cfg?.name || "" - const searchable = [displayName, cfg?.evaluator_key, cfg?.id] + 
const evaluatorKey = resolveEvaluatorKey(cfg) + const searchable = [displayName, evaluatorKey, cfg?.id, cfg?.slug, cfg?.data?.uri] .map((item) => { if (item === undefined || item === null) return undefined const text = String(item).trim() @@ -61,6 +64,7 @@ const buildPreviewLookup = (previewEvaluators: EvaluatorPreviewDto[]) => { const map = new Map() previewEvaluators.forEach((evaluator) => { const rawKey = + resolveEvaluatorKey(evaluator as any) || (evaluator as any)?.evaluator_key || (evaluator as any)?.flags?.evaluator_key || (evaluator as any)?.meta?.evaluator_key || @@ -122,13 +126,14 @@ export const useEvaluatorSelection = ({ const allowedEvaluators = useMemo(() => { if (!evaluators?.length) return [] - return evaluators.filter((config: any) => { + return evaluators.filter((config: SimpleEvaluator) => { if (!config) return false + const evaluatorKey = resolveEvaluatorKey(config) const candidates = collectEvaluatorCandidates( - config?.evaluator_key, - (config as any)?.slug, + evaluatorKey, + config?.slug, config?.name, - config?.key, + (config as any)?.key, config?.meta?.evaluator_key, config?.meta?.key, ) @@ -141,13 +146,13 @@ export const useEvaluatorSelection = ({ if (!allowedEvaluators.length) return [] if (!ENABLE_CORRECT_ANSWER_KEY_FILTER) return allowedEvaluators const requiringKey = evaluatorsRequiringCorrectAnswerKey ?? 
new Set() - return allowedEvaluators.filter((config: any) => { + return allowedEvaluators.filter((config: SimpleEvaluator) => { if (!config) return false - const evaluatorKey = config?.evaluator_key + const evaluatorKey = resolveEvaluatorKey(config) if (evaluatorKey && requiringKey.has(evaluatorKey)) { return false } - const settingsValues = config?.settings_values || {} + const settingsValues = getEvaluatorParameters(config) const requiresCorrectAnswerKey = Object.entries(settingsValues).some(([key, value]) => { if (!key) return false const normalizedKey = key.toLowerCase() @@ -176,7 +181,7 @@ export const useEvaluatorSelection = ({ const previewLookup = useMemo(() => buildPreviewLookup(previewEvaluators), [previewEvaluators]) const matchedPreviewEvaluator = useMemo(() => { - const key = (selectedEvaluatorConfig as any)?.evaluator_key as string | undefined + const key = resolveEvaluatorKey(selectedEvaluatorConfig) if (!key) return undefined return previewLookup.get(key.toLowerCase()) }, [selectedEvaluatorConfig, previewLookup]) diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts index fb54e0978b..9cab865352 100644 --- a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts +++ b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts @@ -1,8 +1,10 @@ import {useMemo} from "react" +import {useMemo} from "react" import {useAtomValue} from "jotai" import {evaluatorConfigsAtom} from "@/oss/lib/atoms/evaluation" +import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils" import useEvaluatorConfigs from "@/oss/lib/hooks/useEvaluatorConfigs" import {EVALUATOR_CATEGORY_LABEL_MAP} from "../constants" @@ -25,6 +27,7 @@ export const useEvaluatorTypeFromConfigs = ({ } const candidates = collectEvaluatorCandidates( + 
resolveEvaluatorKey(evaluator as any), (evaluator as any)?.slug, (evaluator as any)?.key, (evaluator as any)?.meta?.evaluator_key, @@ -32,7 +35,7 @@ export const useEvaluatorTypeFromConfigs = ({ ) const match = configs.find((cfg) => { - const key = (cfg?.evaluator_key || cfg?.name || cfg?.id || "").toString().trim() + const key = (resolveEvaluatorKey(cfg) || cfg?.name || cfg?.id || "").toString().trim() if (!key) return false const lower = key.toLowerCase() if (candidates.includes(lower)) return true @@ -63,7 +66,7 @@ export const useEvaluatorTypeFromConfigs = ({ // 2) Infer label by scanning evaluator_key/name tokens for known category slugs const categorySlugs = Object.keys(EVALUATOR_CATEGORY_LABEL_MAP || {}) const keyTokens = [ - (match as any)?.evaluator_key, + resolveEvaluatorKey(match), (match as any)?.name, (evaluator as any)?.key, (evaluator as any)?.name, diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/utils/evaluatorDetails.ts b/web/oss/src/components/pages/evaluations/onlineEvaluation/utils/evaluatorDetails.ts index fddb7511af..fdbd26e16f 100644 --- a/web/oss/src/components/pages/evaluations/onlineEvaluation/utils/evaluatorDetails.ts +++ b/web/oss/src/components/pages/evaluations/onlineEvaluation/utils/evaluatorDetails.ts @@ -1,3 +1,4 @@ +import {resolveEvaluatorKey} from "@/oss/lib/evaluators/utils" import type {EvaluatorPreviewDto} from "@/oss/lib/hooks/useEvaluators/types" import { @@ -126,6 +127,7 @@ export const extractEvaluatorType = ( } const candidates = collectEvaluatorCandidates( + resolveEvaluatorKey(evaluator as any), (evaluator as any)?.slug, (evaluator as any)?.key, (evaluator as any)?.name, @@ -290,8 +292,8 @@ export const extractParameterList = (evaluator?: EvaluatorPreviewDto): Parameter // Support both simple preview artifacts and workflow evaluators const parameterSources = [ - (evaluator as any)?.settings_values, (evaluator as any)?.data?.parameters, + (evaluator as any)?.settings_values, (evaluator as 
any)?.data?.service?.configuration?.parameters, (evaluator as any)?.data?.configuration?.parameters, ] @@ -359,8 +361,8 @@ export const extractModelName = (evaluator?: EvaluatorPreviewDto) => { } const sources = [ - (evaluator as any)?.settings_values, (evaluator as any)?.data?.parameters, + (evaluator as any)?.settings_values, (evaluator as any)?.data?.service?.configuration, (evaluator as any)?.data?.service?.configuration?.parameters, (evaluator as any)?.data?.configuration, @@ -660,7 +662,8 @@ const normalizeMessageContent = ( export const extractPromptSections = (evaluator?: EvaluatorPreviewDto): PromptPreviewSection[] => { if (!evaluator) return [] const data = (evaluator as any)?.data ?? {} - const settings = (evaluator as any)?.settings_values + const parameters = data?.parameters + const settings = parameters ?? (evaluator as any)?.settings_values const agConfig = data?.parameters?.ag_config ?? data?.parameters?.agConfig const messages = findFirstMessages(settings) ?? @@ -728,7 +731,6 @@ export const extractPromptSections = (evaluator?: EvaluatorPreviewDto): PromptPr const promptSources = [ settings, - data?.parameters, data?.service?.configuration?.parameters, data?.configuration?.parameters, ] diff --git a/web/oss/src/lib/Types.ts b/web/oss/src/lib/Types.ts index 74f6f31a51..2bd357fc60 100644 --- a/web/oss/src/lib/Types.ts +++ b/web/oss/src/lib/Types.ts @@ -870,6 +870,76 @@ export interface Evaluator { archived?: boolean } +export interface SimpleEvaluatorData { + version?: string + uri?: string + url?: string + headers?: Record + schemas?: Record + script?: {content?: string; runtime?: string} + parameters?: Record + service?: Record + configuration?: Record +} + +export interface SimpleEvaluatorFlags { + is_custom?: boolean + is_evaluator?: boolean + is_human?: boolean + requires_llm_api_keys?: boolean + evaluator_key?: string + color?: string +} + +export interface SimpleEvaluator { + id: string + slug: string + name?: string + description?: string + 
tags?: string[] + meta?: Record + flags?: SimpleEvaluatorFlags + data?: SimpleEvaluatorData + created_at?: string + updated_at?: string + deleted_at?: string | null + created_by_id?: string + updated_by_id?: string + deleted_by_id?: string + color?: string + icon_url?: string | StaticImageData +} + +export interface SimpleEvaluatorCreate { + slug: string + name?: string + description?: string + tags?: string[] + meta?: Record + flags?: SimpleEvaluatorFlags + data?: SimpleEvaluatorData +} + +export interface SimpleEvaluatorEdit { + id: string + name?: string + description?: string + tags?: string[] + meta?: Record + flags?: SimpleEvaluatorFlags + data?: SimpleEvaluatorData +} + +export interface SimpleEvaluatorResponse { + count: number + evaluator: SimpleEvaluator | null +} + +export interface SimpleEvaluatorsResponse { + count: number + evaluators: SimpleEvaluator[] +} + export interface EvaluatorConfig { id: string evaluator_key: string diff --git a/web/oss/src/lib/atoms/evaluation.ts b/web/oss/src/lib/atoms/evaluation.ts index 323dde41cb..1fbc0039ad 100644 --- a/web/oss/src/lib/atoms/evaluation.ts +++ b/web/oss/src/lib/atoms/evaluation.ts @@ -1,6 +1,6 @@ import {atom} from "jotai" -import {Evaluation, EvaluationScenario, Evaluator, EvaluatorConfig} from "../Types" +import {Evaluation, EvaluationScenario, Evaluator, SimpleEvaluator} from "../Types" export const evaluationAtom = atom(undefined) @@ -8,4 +8,4 @@ export const evaluationScenariosAtom = atom([]) export const evaluatorsAtom = atom([]) -export const evaluatorConfigsAtom = atom([]) +export const evaluatorConfigsAtom = atom([]) diff --git a/web/oss/src/lib/evaluators/utils.ts b/web/oss/src/lib/evaluators/utils.ts new file mode 100644 index 0000000000..e21d98a62e --- /dev/null +++ b/web/oss/src/lib/evaluators/utils.ts @@ -0,0 +1,80 @@ +import type {SimpleEvaluator, SimpleEvaluatorData} from "@/oss/lib/Types" + +const normalizeSlugBase = (value?: string | null) => + String(value ?? 
"") + .trim() + .toLowerCase() + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-+|-+$/g, "") + +const trimVersionSuffix = (value: string) => value.replace(/-v\d+$/i, "") + +export const extractEvaluatorKeyFromUri = (uri?: string | null): string | undefined => { + if (!uri) return undefined + const trimmed = uri.trim() + if (!trimmed) return undefined + + const builtinMatch = trimmed.match(/^agenta:builtin:([^:]+)(:|$)/i) + if (builtinMatch?.[1]) { + return trimVersionSuffix(builtinMatch[1]) + } + + const parts = trimmed.split(":").filter(Boolean) + if (parts.length >= 3 && parts[2]) { + return trimVersionSuffix(parts[2]) + } + + const slashParts = trimmed.split("/").filter(Boolean) + const lastSegment = slashParts[slashParts.length - 1] + if (lastSegment) { + return trimVersionSuffix(lastSegment) + } + + return undefined +} + +export const resolveEvaluatorKey = ( + evaluator?: Partial | null, +): string | undefined => { + if (!evaluator) return undefined + + const candidate = + extractEvaluatorKeyFromUri(evaluator.data?.uri) || + (typeof (evaluator as any)?.evaluator_key === "string" + ? (evaluator as any).evaluator_key + : undefined) || + (typeof evaluator.meta?.evaluator_key === "string" + ? evaluator.meta.evaluator_key + : undefined) || + (typeof evaluator.flags?.evaluator_key === "string" + ? evaluator.flags.evaluator_key + : undefined) || + (typeof (evaluator as any)?.key === "string" ? (evaluator as any).key : undefined) + + return candidate ? 
String(candidate).trim() : undefined +} + +export const buildEvaluatorUri = (evaluatorKey: string, version = "v0") => + `agenta:builtin:${evaluatorKey}:${version}` + +export const buildEvaluatorSlug = (name?: string | null) => { + const base = normalizeSlugBase(name) || "evaluator" + const suffix = Math.random().toString(36).slice(2, 8) + const maxBaseLength = Math.max(1, 50 - suffix.length - 1) + const trimmedBase = base.slice(0, maxBaseLength) + return `${trimmedBase}-${suffix}` +} + +export const mergeEvaluatorData = ( + base?: SimpleEvaluatorData | null, + updates?: Partial | null, +): SimpleEvaluatorData | undefined => { + if (!base && !updates) return undefined + return { + ...(base ?? {}), + ...(updates ?? {}), + } +} + +export const getEvaluatorParameters = (evaluator?: Partial | null) => + (evaluator?.data?.parameters as Record) || {} diff --git a/web/oss/src/lib/hooks/useEvaluatorConfigs/index.ts b/web/oss/src/lib/hooks/useEvaluatorConfigs/index.ts index 3765eb6677..998f65459e 100644 --- a/web/oss/src/lib/hooks/useEvaluatorConfigs/index.ts +++ b/web/oss/src/lib/hooks/useEvaluatorConfigs/index.ts @@ -6,11 +6,11 @@ import {SWRConfiguration} from "swr" import {useAppId} from "@/oss/hooks/useAppId" import {evaluatorConfigsQueryAtomFamily} from "@/oss/state/evaluators" -import {EvaluatorConfig} from "../../Types" +import {SimpleEvaluator} from "../../Types" type EvaluatorConfigResult = Preview extends true ? 
undefined - : EvaluatorConfig[] + : SimpleEvaluator[] type EvaluatorConfigsOptions = { preview?: Preview diff --git a/web/oss/src/services/evaluations/api/index.ts b/web/oss/src/services/evaluations/api/index.ts index 43bfdb3ca8..9702d501b5 100644 --- a/web/oss/src/services/evaluations/api/index.ts +++ b/web/oss/src/services/evaluations/api/index.ts @@ -17,7 +17,7 @@ export { createEvaluatorConfig, updateEvaluatorConfig, deleteEvaluatorConfig, - type CreateEvaluationConfigData, + type CreateEvaluatorConfigData, } from "@/oss/services/evaluators" //Prefix convention: diff --git a/web/oss/src/services/evaluators/index.ts b/web/oss/src/services/evaluators/index.ts index 2a9bb15de7..9a85ae7c8c 100644 --- a/web/oss/src/services/evaluators/index.ts +++ b/web/oss/src/services/evaluators/index.ts @@ -1,9 +1,21 @@ import axios from "@/oss/lib/api/assets/axiosConfig" +import { + buildEvaluatorSlug, + buildEvaluatorUri, + resolveEvaluatorKey, +} from "@/oss/lib/evaluators/utils" import {getAgentaApiUrl} from "@/oss/lib/helpers/api" import {getTagColors} from "@/oss/lib/helpers/colors" import {isDemo, stringToNumberInRange} from "@/oss/lib/helpers/utils" import {EvaluatorResponseDto} from "@/oss/lib/hooks/useEvaluators/types" -import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types" +import { + Evaluator, + SimpleEvaluator, + SimpleEvaluatorCreate, + SimpleEvaluatorEdit, + SimpleEvaluatorResponse, + SimpleEvaluatorsResponse, +} from "@/oss/lib/Types" import aiImg from "@/oss/media/artificial-intelligence.png" import bracketCurlyImg from "@/oss/media/bracket-curly.png" import codeImg from "@/oss/media/browser.png" @@ -48,7 +60,7 @@ export const updateEvaluator = async ( } } -export const fetchEvaluatorById = async (evaluatorId: string) => { +export const fetchEvaluatorById = async (evaluatorId: string): Promise => { const {projectId} = getProjectValues() if (!projectId) { return null @@ -59,7 +71,7 @@ export const fetchEvaluatorById = async (evaluatorId: string) => { ) const 
payload = (response?.data as any)?.evaluator ?? response?.data ?? null if (!payload) return null - return payload as EvaluatorResponseDto<"response">["evaluator"] + return decorateSimpleEvaluator(payload as SimpleEvaluator) } const evaluatorIconsMap = { @@ -103,58 +115,112 @@ export const fetchAllEvaluators = async (includeArchived = false) => { } // Evaluator Configs +function decorateSimpleEvaluator(evaluator: SimpleEvaluator) { + const tagColors = getTagColors() + const evaluatorKey = resolveEvaluatorKey(evaluator) + if (!evaluatorKey) return evaluator + + return { + ...evaluator, + icon_url: evaluatorIconsMap[evaluatorKey as keyof typeof evaluatorIconsMap], + color: tagColors[stringToNumberInRange(evaluatorKey, 0, tagColors.length - 1)], + } +} + export const fetchAllEvaluatorConfigs = async ( appId?: string | null, projectIdOverride?: string | null, -) => { - const tagColors = getTagColors() +): Promise => { const {projectId: projectIdFromStore} = getProjectValues() const projectId = projectIdOverride ?? projectIdFromStore + void appId if (!projectId) { - return [] as EvaluatorConfig[] + return [] as SimpleEvaluator[] } - const response = await axios.get("/evaluators/configs", { - params: { - project_id: projectId, - ...(appId ? {app_id: appId} : {}), + const response = await axios.post( + `${getAgentaApiUrl()}/preview/simple/evaluators/query?project_id=${projectId}`, + { + evaluator: { + flags: { + is_evaluator: true, + is_human: false, + }, + }, + include_archived: false, }, - }) - const evaluatorConfigs = (response.data || []).map((item: EvaluatorConfig) => ({ - ...item, - icon_url: evaluatorIconsMap[item.evaluator_key as keyof typeof evaluatorIconsMap], - color: tagColors[stringToNumberInRange(item.evaluator_key, 0, tagColors.length - 1)], - })) as EvaluatorConfig[] - return evaluatorConfigs + ) + + const evaluators = response.data?.evaluators ?? 
[] + return evaluators.filter((item) => !item.deleted_at).map(decorateSimpleEvaluator) +} + +export interface CreateEvaluatorConfigData { + name: string + evaluator_key: string + parameters: Record + tags?: string[] + description?: string } -export type CreateEvaluationConfigData = Omit export const createEvaluatorConfig = async ( _appId: string | null | undefined, - config: CreateEvaluationConfigData, -) => { + config: CreateEvaluatorConfigData, +): Promise => { const {projectId} = getProjectValues() void _appId - return axios.post(`/evaluators/configs?project_id=${projectId}`, { - ...config, - }) + const payload: SimpleEvaluatorCreate = { + slug: buildEvaluatorSlug(config.name), + name: config.name, + description: config.description, + tags: config.tags, + flags: {is_evaluator: true, is_human: false}, + data: { + uri: buildEvaluatorUri(config.evaluator_key), + parameters: config.parameters, + }, + } + + const response = await axios.post( + `${getAgentaApiUrl()}/preview/simple/evaluators/?project_id=${projectId}`, + {evaluator: payload}, + ) + + const evaluator = response.data?.evaluator + if (!evaluator) { + throw new Error("Failed to create evaluator") + } + + return decorateSimpleEvaluator(evaluator) } export const updateEvaluatorConfig = async ( configId: string, - config: Partial, -) => { + config: SimpleEvaluatorEdit, +): Promise => { const {projectId} = getProjectValues() - return axios.put(`/evaluators/configs/${configId}?project_id=${projectId}`, config) + const response = await axios.put( + `${getAgentaApiUrl()}/preview/simple/evaluators/${configId}?project_id=${projectId}`, + {evaluator: {...config, id: configId}}, + ) + + const evaluator = response.data?.evaluator + if (!evaluator) { + throw new Error("Failed to update evaluator") + } + + return decorateSimpleEvaluator(evaluator) } export const deleteEvaluatorConfig = async (configId: string) => { const {projectId} = getProjectValues() - return 
axios.delete(`/evaluators/configs/${configId}?project_id=${projectId}`) + return axios.post( + `${getAgentaApiUrl()}/preview/simple/evaluators/${configId}/archive?project_id=${projectId}`, + ) } export const deleteHumanEvaluator = async (evaluatorId: string) => { diff --git a/web/oss/src/state/evaluators/atoms.ts b/web/oss/src/state/evaluators/atoms.ts index 24f390e884..5c6b34c9e2 100644 --- a/web/oss/src/state/evaluators/atoms.ts +++ b/web/oss/src/state/evaluators/atoms.ts @@ -5,6 +5,7 @@ import {atomWithQuery} from "jotai-tanstack-query" import {getMetricsFromEvaluator} from "@/oss/components/SharedDrawers/AnnotateDrawer/assets/transforms" import axios from "@/oss/lib/api/assets/axiosConfig" import {evaluatorsAtom} from "@/oss/lib/atoms/evaluation" +import {extractEvaluatorKeyFromUri} from "@/oss/lib/evaluators/utils" import {transformApiData} from "@/oss/lib/hooks/useAnnotations/assets/transformer" import { EvaluatorDto, @@ -13,7 +14,7 @@ import { EvaluatorRevisionsResponseDto, EvaluatorsResponseDto, } from "@/oss/lib/hooks/useEvaluators/types" -import {Evaluator, EvaluatorConfig} from "@/oss/lib/Types" +import {Evaluator, SimpleEvaluator} from "@/oss/lib/Types" import {fetchAllEvaluatorConfigs, fetchAllEvaluators} from "@/oss/services/evaluators" import {selectedAppIdAtom} from "@/oss/state/app" import {selectedOrgAtom} from "@/oss/state/org" @@ -26,16 +27,15 @@ import {EvaluatorConfigsParams, EvaluatorsParams} from "./types" const extractKeyFromUri = (uri: unknown): string | undefined => { if (typeof uri !== "string") return undefined - const match = uri.match(/[:/](auto_[a-z0-9_]+)/i) - if (match?.[1]) return match[1] - const parts = uri.split(":").filter(Boolean) - if (parts.length) { - const candidate = parts[parts.length - 1] - if (candidate) { - return candidate.replace(/-v\d+$/i, "") - } - } - return undefined + return ( + extractEvaluatorKeyFromUri(uri) || + uri.match(/[:/](auto_[a-z0-9_]+)/i)?.[1] || + uri + .split(":") + .filter(Boolean) + 
.slice(-1)[0] + ?.replace(/-v\d+$/i, "") + ) } const isPlainObject = (value: unknown): value is Record => { @@ -102,7 +102,7 @@ const extractRequiresLlmApiKeys = (source: unknown): boolean | undefined => { export const evaluatorConfigsQueryAtomFamily = atomFamily( ({projectId: overrideProjectId, appId: overrideAppId, preview}: EvaluatorConfigsParams = {}) => - atomWithQuery((get) => { + atomWithQuery((get) => { const projectId = overrideProjectId || get(projectIdAtom) const appId = overrideAppId || get(selectedAppIdAtom) const user = get(userAtom) as {id?: string} | null From 02ad4dcbea427c445023b644cf73b22194763ba6 Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Wed, 28 Jan 2026 13:00:57 +0100 Subject: [PATCH 4/4] fix(frontend): remove duplicate hook imports --- .../evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts | 1 - .../evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx | 1 - .../onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts | 1 - 3 files changed, 3 deletions(-) diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts index 42612e4322..0545163cae 100644 --- a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts +++ b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorDetails.ts @@ -1,5 +1,4 @@ import {useMemo} from "react" -import {useMemo} from "react" import type {EvaluatorPreviewDto} from "@/oss/lib/hooks/useEvaluators/types" diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx index c1cf18acd2..d5e724dcef 100644 --- a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx +++ 
b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorSelection.tsx @@ -1,5 +1,4 @@ import {useMemo} from "react" -import {useMemo} from "react" import {SelectProps} from "antd" diff --git a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts index 9cab865352..3cef385d5d 100644 --- a/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts +++ b/web/oss/src/components/pages/evaluations/onlineEvaluation/hooks/useEvaluatorTypeFromConfigs.ts @@ -1,5 +1,4 @@ import {useMemo} from "react" -import {useMemo} from "react" import {useAtomValue} from "jotai"